2007-07-09 06:27:46 +00:00
|
|
|
from zope.interface import implements
|
2006-12-14 03:32:35 +00:00
|
|
|
from twisted.trial import unittest
|
2008-07-14 22:25:21 +00:00
|
|
|
from twisted.internet import defer, reactor
|
|
|
|
from twisted.internet.interfaces import IConsumer
|
2007-04-16 23:30:21 +00:00
|
|
|
from twisted.python.failure import Failure
|
2008-01-28 19:14:48 +00:00
|
|
|
from foolscap import eventual
|
2008-07-16 20:14:39 +00:00
|
|
|
from allmydata import hashtree, uri
|
|
|
|
from allmydata.immutable import encode, upload, download
|
2008-12-05 20:07:23 +00:00
|
|
|
from allmydata.util import hashutil
|
2007-07-24 02:31:53 +00:00
|
|
|
from allmydata.util.assertutil import _assert
|
2008-10-27 20:34:49 +00:00
|
|
|
from allmydata.interfaces import IStorageBucketWriter, IStorageBucketReader, NotEnoughSharesError
|
2009-01-08 21:42:15 +00:00
|
|
|
from allmydata.monitor import Monitor
|
2008-10-29 04:28:31 +00:00
|
|
|
import common_util as testutil
|
2006-12-14 03:32:35 +00:00
|
|
|
|
2007-06-06 17:32:40 +00:00
|
|
|
class LostPeerError(Exception):
    """Raised by the fake bucket proxy to simulate a peer that went away."""
|
|
|
|
|
2007-06-08 04:24:39 +00:00
|
|
|
def flip_bit(good): # flips the last bit
    """Return a copy of `good` whose final character has its lowest bit inverted."""
    head, tail = good[:-1], good[-1]
    flipped = chr(ord(tail) ^ 0x01)
    return head + flipped
|
|
|
|
|
2007-11-20 02:07:10 +00:00
|
|
|
class FakeClient:
    """Minimal client stand-in whose log() accepts anything and does nothing."""

    def log(self, *args, **kwargs):
        """Accept any positional/keyword arguments and discard them."""
        return None
|
|
|
|
|
download: refactor handling of URI Extension Block and crypttext hash tree, simplify things
Refactor into a class the logic of asking each server in turn until one of them gives an answer
that validates. It is called ValidatedThingObtainer.
Refactor the downloading and verification of the URI Extension Block into a class named
ValidatedExtendedURIProxy.
The new logic of validating UEBs is minimalist: it doesn't require the UEB to contain any
unnecessary information, but of course it still accepts such information for backwards
compatibility (so that this new download code is able to download files uploaded with old, and
for that matter with current, upload code).
The new logic of validating UEBs follows the practice of doing all validation up front. This
practice advises one to isolate the validation of incoming data into one place, so that all of
the rest of the code can assume only valid data.
If any redundant information is present in the UEB+URI, the new code cross-checks and asserts
that it is all fully consistent. This closes some issues where the uploader could have
uploaded inconsistent redundant data, which would probably have caused the old downloader to
simply reject that download after getting a Python exception, but perhaps could have caused
greater harm to the old downloader.
I removed the notion of selecting an erasure codec from codec.py based on the string that was
passed in the UEB. Currently "crs" is the only such string that works, so
"_assert(codec_name == 'crs')" is simpler and more explicit. This is also in keeping with the
"validate up front" strategy -- now if someone sets a different string than "crs" in their UEB,
the downloader will reject the download in the "validate this UEB" function instead of in a
separate "select the codec instance" function.
I removed the code to check plaintext hashes and plaintext Merkle Trees. Uploaders do not
produce this information any more (since it potentially exposes confidential information about
the file), and the unit tests for it were disabled. The downloader before this patch would
check that plaintext hash or plaintext merkle tree if they were present, but not complain if
they were absent. The new downloader in this patch complains if they are present and doesn't
check them. (We might in the future re-introduce such hashes over the plaintext, but encrypt
the hashes which are stored in the UEB to preserve confidentiality. This would be a double-
check on the correctness of our own source code -- the current Merkle Tree over the ciphertext
is already sufficient to guarantee the integrity of the download unless there is a bug in our
Merkle Tree or AES implementation.)
This patch increases the lines-of-code count by 8 (from 17,770 to 17,778), and reduces the
uncovered-by-tests lines-of-code count by 24 (from 1408 to 1384). Those numbers would be more
meaningful if we omitted src/allmydata/util/ from the test-coverage statistics.
2008-12-05 15:17:54 +00:00
|
|
|
class FakeBucketReaderWriterProxy:
|
2007-07-09 06:27:46 +00:00
|
|
|
implements(IStorageBucketWriter, IStorageBucketReader)
|
2007-04-16 23:30:21 +00:00
|
|
|
# these are used for both reading and writing
|
|
|
|
def __init__(self, mode="good"):
|
|
|
|
self.mode = mode
|
2007-03-30 18:32:57 +00:00
|
|
|
self.blocks = {}
|
2008-03-23 21:46:49 +00:00
|
|
|
self.plaintext_hashes = []
|
|
|
|
self.crypttext_hashes = []
|
2007-03-30 20:20:01 +00:00
|
|
|
self.block_hashes = None
|
|
|
|
self.share_hashes = None
|
2007-03-30 18:32:57 +00:00
|
|
|
self.closed = False
|
|
|
|
|
2008-03-04 03:30:35 +00:00
|
|
|
def get_peerid(self):
    """Return the fixed placeholder peer id used by this fake."""
    peerid = "peerid"
    return peerid
|
|
|
|
|
immutable: refactor downloader to be more reusable for checker/verifier/repairer (and better)
The code for validating the share hash tree and the block hash tree has been rewritten to make sure it handles all cases, to share metadata about the file (such as the share hash tree, block hash trees, and UEB) among different share downloads, and not to require hashes to be stored on the server unnecessarily, such as the roots of the block hash trees (not needed since they are also the leaves of the share hash tree), and the root of the share hash tree (not needed since it is also included in the UEB). It also passes the latest tests including handling corrupted shares well.
ValidatedReadBucketProxy takes a share_hash_tree argument to its constructor, which is a reference to a share hash tree shared by all ValidatedReadBucketProxies for that immutable file download.
ValidatedReadBucketProxy requires the block_size and share_size to be provided in its constructor, and it then uses those to compute the offsets and lengths of blocks when it needs them, instead of reading those values out of the share. The user of ValidatedReadBucketProxy therefore has to have first used a ValidatedExtendedURIProxy to compute those two values from the validated contents of the URI. This pleasingly simplifies safety analysis: the client knows which span of bytes corresponds to a given block from the validated URI data, rather than from the unvalidated data stored on the storage server. It also simplifies unit testing of verifier/repairer, because now it doesn't care about the contents of the "share size" and "block size" fields in the share. It does not relieve the need for share data v2 layout, because we still need to store and retrieve the offsets of the fields which come after the share data, therefore we still need to use share data v2 with its 8-byte fields if we want to store share data larger than about 2^32.
Specify which subset of the block hashes and share hashes you need while downloading a particular share. In the future this will hopefully be used to fetch only a subset, for network efficiency, but currently all of them are fetched, regardless of which subset you specify.
ReadBucketProxy hides the question of whether it has "started" or not (sent a request to the server to get metadata) from its user.
Download is optimized to do as few roundtrips and as few requests as possible, hopefully speeding up download a bit.
2009-01-05 16:51:45 +00:00
|
|
|
def _start(self):
    """Simulate the initial contact with the storage server.

    In "lost-early" mode this returns eventual.fireEventually() of a
    Failure wrapping LostPeerError; in every other mode it returns a
    Deferred that has already succeeded with this proxy.
    """
    if self.mode != "lost-early":
        return defer.succeed(self)
    lost = Failure(LostPeerError("I went away early"))
    return eventual.fireEventually(lost)
|
2007-03-30 18:32:57 +00:00
|
|
|
|
immutable: refactor downloader to be more reusable for checker/verifier/repairer (and better)
The code for validating the share hash tree and the block hash tree has been rewritten to make sure it handles all cases, to share metadata about the file (such as the share hash tree, block hash trees, and UEB) among different share downloads, and not to require hashes to be stored on the server unnecessarily, such as the roots of the block hash trees (not needed since they are also the leaves of the share hash tree), and the root of the share hash tree (not needed since it is also included in the UEB). It also passes the latest tests including handling corrupted shares well.
ValidatedReadBucketProxy takes a share_hash_tree argument to its constructor, which is a reference to a share hash tree shared by all ValidatedReadBucketProxies for that immutable file download.
ValidatedReadBucketProxy requires the block_size and share_size to be provided in its constructor, and it then uses those to compute the offsets and lengths of blocks when it needs them, instead of reading those values out of the share. The user of ValidatedReadBucketProxy therefore has to have first used a ValidatedExtendedURIProxy to compute those two values from the validated contents of the URI. This pleasingly simplifies safety analysis: the client knows which span of bytes corresponds to a given block from the validated URI data, rather than from the unvalidated data stored on the storage server. It also simplifies unit testing of verifier/repairer, because now it doesn't care about the contents of the "share size" and "block size" fields in the share. It does not relieve the need for share data v2 layout, because we still need to store and retrieve the offsets of the fields which come after the share data, therefore we still need to use share data v2 with its 8-byte fields if we want to store share data larger than about 2^32.
Specify which subset of the block hashes and share hashes you need while downloading a particular share. In the future this will hopefully be used to fetch only a subset, for network efficiency, but currently all of them are fetched, regardless of which subset you specify.
ReadBucketProxy hides the question of whether it has "started" or not (sent a request to the server to get metadata) from its user.
Download is optimized to do as few roundtrips and as few requests as possible, hopefully speeding up download a bit.
2009-01-05 16:51:45 +00:00
|
|
|
def put_header(self):
    """Writer-side entry point; hands back whatever _start() returns."""
    started = self._start()
    return started
|
|
|
|
|
2007-03-30 18:32:57 +00:00
|
|
|
def put_block(self, segmentnum, data):
    """Store `data` as block `segmentnum` and return a Deferred.

    "lost-early" mode fails before anything is stored. "lost" mode raises
    LostPeerError for every segment numbered 1 or higher. Writing after
    close(), or writing the same segment twice, trips an assertion.
    """
    if self.mode == "lost-early":
        lost = Failure(LostPeerError("I went away early"))
        return eventual.fireEventually(lost)

    def _store():
        assert not self.closed
        assert segmentnum not in self.blocks
        peer_is_gone = self.mode == "lost" and segmentnum >= 1
        if peer_is_gone:
            raise LostPeerError("I'm going away now")
        self.blocks[segmentnum] = data

    # maybeDeferred turns the assertions/LostPeerError into a failed Deferred
    return defer.maybeDeferred(_store)
|
2007-06-06 17:32:40 +00:00
|
|
|
|
2007-06-07 02:40:20 +00:00
|
|
|
def put_plaintext_hashes(self, hashes):
    """Record the plaintext hashes; asserts the bucket is open and none are stored yet."""
    def _store():
        assert not self.closed
        assert not self.plaintext_hashes
        self.plaintext_hashes = hashes

    return defer.maybeDeferred(_store)
|
2007-06-07 02:40:20 +00:00
|
|
|
|
|
|
|
def put_crypttext_hashes(self, hashes):
    """Record the crypttext hashes; asserts the bucket is open and none are stored yet."""
    def _store():
        assert not self.closed
        assert not self.crypttext_hashes
        self.crypttext_hashes = hashes

    return defer.maybeDeferred(_store)
|
2007-06-07 02:40:20 +00:00
|
|
|
|
2007-03-30 18:32:57 +00:00
|
|
|
def put_block_hashes(self, blockhashes):
    """Record the block hashes; asserts the bucket is open and they were never set."""
    def _store():
        assert not self.closed
        assert self.block_hashes is None
        self.block_hashes = blockhashes

    return defer.maybeDeferred(_store)
|
2007-11-01 22:25:00 +00:00
|
|
|
|
2007-03-30 18:32:57 +00:00
|
|
|
def put_share_hashes(self, sharehashes):
    """Record the share hashes; asserts the bucket is open and they were never set."""
    def _store():
        assert not self.closed
        assert self.share_hashes is None
        self.share_hashes = sharehashes

    return defer.maybeDeferred(_store)
|
2007-03-30 18:32:57 +00:00
|
|
|
|
2007-06-08 22:59:16 +00:00
|
|
|
def put_uri_extension(self, uri_extension):
    """Store the URI extension on the instance; only asserts the bucket is still open."""
    def _store():
        assert not self.closed
        self.uri_extension = uri_extension

    return defer.maybeDeferred(_store)
|
2007-06-02 01:48:01 +00:00
|
|
|
|
2007-03-30 18:32:57 +00:00
|
|
|
def close(self):
    """Mark the bucket closed; closing a second time trips the assertion."""
    def _mark_closed():
        assert not self.closed
        self.closed = True

    return defer.maybeDeferred(_mark_closed)
|
2007-03-30 18:32:57 +00:00
|
|
|
|
2008-01-15 04:22:55 +00:00
|
|
|
def abort(self):
    """Do nothing and return a Deferred that has already fired with None."""
    done = defer.succeed(None)
    return done
|
|
|
|
|
immutable: refactor downloader to be more reusable for checker/verifier/repairer (and better)
The code for validating the share hash tree and the block hash tree has been rewritten to make sure it handles all cases, to share metadata about the file (such as the share hash tree, block hash trees, and UEB) among different share downloads, and not to require hashes to be stored on the server unnecessarily, such as the roots of the block hash trees (not needed since they are also the leaves of the share hash tree), and the root of the share hash tree (not needed since it is also included in the UEB). It also passes the latest tests including handling corrupted shares well.
ValidatedReadBucketProxy takes a share_hash_tree argument to its constructor, which is a reference to a share hash tree shared by all ValidatedReadBucketProxies for that immutable file download.
ValidatedReadBucketProxy requires the block_size and share_size to be provided in its constructor, and it then uses those to compute the offsets and lengths of blocks when it needs them, instead of reading those values out of the share. The user of ValidatedReadBucketProxy therefore has to have first used a ValidatedExtendedURIProxy to compute those two values from the validated contents of the URI. This pleasingly simplifies safety analysis: the client knows which span of bytes corresponds to a given block from the validated URI data, rather than from the unvalidated data stored on the storage server. It also simplifies unit testing of verifier/repairer, because now it doesn't care about the contents of the "share size" and "block size" fields in the share. It does not relieve the need for share data v2 layout, because we still need to store and retrieve the offsets of the fields which come after the share data, therefore we still need to use share data v2 with its 8-byte fields if we want to store share data larger than about 2^32.
Specify which subset of the block hashes and share hashes you need while downloading a particular share. In the future this will hopefully be used to fetch only a subset, for network efficiency, but currently all of them are fetched, regardless of which subset you specify.
ReadBucketProxy hides the question of whether it has "started" or not (sent a request to the server to get metadata) from its user.
Download is optimized to do as few roundtrips and as few requests as possible, hopefully speeding up download a bit.
2009-01-05 16:51:45 +00:00
|
|
|
def get_block_data(self, blocknum, blocksize, size):
    """Return a Deferred for the stored block `blocknum`.

    In "bad block" mode the block comes back with its last bit flipped.
    `blocksize` and `size` are accepted but never consulted by this fake.
    """
    def _fetch(unused=None):
        assert isinstance(blocknum, (int, long))
        block = self.blocks[blocknum]
        if self.mode == "bad block":
            block = flip_bit(block)
        return block

    d = self._start()
    d.addCallback(_fetch)
    return d
|
2007-03-30 20:20:01 +00:00
|
|
|
|
2007-06-07 07:15:41 +00:00
|
|
|
def get_plaintext_hashes(self):
    """Return the Deferred from _start(), with a callback yielding a copy of the plaintext hashes."""
    def _copy(unused=None):
        # slice so callers never get the stored sequence itself
        return self.plaintext_hashes[:]

    d = self._start()
    d.addCallback(_copy)
    return d
|
2007-06-08 02:32:29 +00:00
|
|
|
|
2007-06-07 07:15:41 +00:00
|
|
|
def get_crypttext_hashes(self):
    """Return the Deferred from _start(), with a callback yielding a copy of the crypttext hashes.

    "bad crypttext hashroot" mode flips a bit in entry 0 of the copy and
    "bad crypttext hash" mode flips a bit in entry 1; the stored list is
    never modified.
    """
    corruptions = (
        ("bad crypttext hashroot", 0),
        ("bad crypttext hash", 1),
    )

    def _copy_and_corrupt(unused=None):
        hashes = self.crypttext_hashes[:]
        for bad_mode, index in corruptions:
            if self.mode == bad_mode:
                hashes[index] = flip_bit(hashes[index])
        return hashes

    d = self._start()
    d.addCallback(_copy_and_corrupt)
    return d
|
2007-06-07 07:15:41 +00:00
|
|
|
|
immutable: refactor downloader to be more reusable for checker/verifier/repairer (and better)
The code for validating the share hash tree and the block hash tree has been rewritten to make sure it handles all cases, to share metadata about the file (such as the share hash tree, block hash trees, and UEB) among different share downloads, and not to require hashes to be stored on the server unnecessarily, such as the roots of the block hash trees (not needed since they are also the leaves of the share hash tree), and the root of the share hash tree (not needed since it is also included in the UEB). It also passes the latest tests including handling corrupted shares well.
ValidatedReadBucketProxy takes a share_hash_tree argument to its constructor, which is a reference to a share hash tree shared by all ValidatedReadBucketProxies for that immutable file download.
ValidatedReadBucketProxy requires the block_size and share_size to be provided in its constructor, and it then uses those to compute the offsets and lengths of blocks when it needs them, instead of reading those values out of the share. The user of ValidatedReadBucketProxy therefore has to have first used a ValidatedExtendedURIProxy to compute those two values from the validated contents of the URI. This pleasingly simplifies safety analysis: the client knows which span of bytes corresponds to a given block from the validated URI data, rather than from the unvalidated data stored on the storage server. It also simplifies unit testing of verifier/repairer, because now it doesn't care about the contents of the "share size" and "block size" fields in the share. It does not relieve the need for share data v2 layout, because we still need to store and retrieve the offsets of the fields which come after the share data, therefore we still need to use share data v2 with its 8-byte fields if we want to store share data larger than about 2^32.
Specify which subset of the block hashes and share hashes you need while downloading a particular share. In the future this will hopefully be used to fetch only a subset, for network efficiency, but currently all of them are fetched, regardless of which subset you specify.
ReadBucketProxy hides the question of whether it has "started" or not (sent a request to the server to get metadata) from its user.
Download is optimized to do as few roundtrips and as few requests as possible, hopefully speeding up download a bit.
2009-01-05 16:51:45 +00:00
|
|
|
def get_block_hashes(self, at_least_these=()):
    """Hand back this fake bucket's block hashes via self._start().

    at_least_these is accepted but never consulted: the complete list is
    always delivered. When self.mode is "bad blockhash" the caller
    receives a copy whose entry at index 1 has been run through
    flip_bit(); self.block_hashes itself is left untouched.

    Returns the object produced by self._start(), after a callback that
    yields the (possibly corrupted) hashes has been added to it.
    """
    def _deliver(_ignored=None):
        if self.mode != "bad blockhash":
            return self.block_hashes
        # tamper with a copy so the stored hashes stay pristine
        tampered = self.block_hashes[:]
        tampered[1] = flip_bit(tampered[1])
        return tampered

    pending = self._start()
    pending.addCallback(_deliver)
    return pending
|
2007-07-09 06:27:46 +00:00
|
|
|
|
immutable: refactor downloader to be more reusable for checker/verifier/repairer (and better)
The code for validating the share hash tree and the block hash tree has been rewritten to make sure it handles all cases, to share metadata about the file (such as the share hash tree, block hash trees, and UEB) among different share downloads, and not to require hashes to be stored on the server unnecessarily, such as the roots of the block hash trees (not needed since they are also the leaves of the share hash tree), and the root of the share hash tree (not needed since it is also included in the UEB). It also passes the latest tests including handling corrupted shares well.
ValidatedReadBucketProxy takes a share_hash_tree argument to its constructor, which is a reference to a share hash tree shared by all ValidatedReadBucketProxies for that immutable file download.
ValidatedReadBucketProxy requires the block_size and share_size to be provided in its constructor, and it then uses those to compute the offsets and lengths of blocks when it needs them, instead of reading those values out of the share. The user of ValidatedReadBucketProxy therefore has to have first used a ValidatedExtendedURIProxy to compute those two values from the validated contents of the URI. This pleasingly simplifies safety analysis: the client knows which span of bytes corresponds to a given block from the validated URI data, rather than from the unvalidated data stored on the storage server. It also simplifies unit testing of verifier/repairer, because now it doesn't care about the contents of the "share size" and "block size" fields in the share. It does not relieve the need for share data v2 layout, because we still need to store and retrieve the offsets of the fields which come after the share data; therefore we still need to use share data v2 with its 8-byte fields if we want to store share data larger than about 2^32.
Specify which subset of the block hashes and share hashes you need while downloading a particular share. In the future this will hopefully be used to fetch only a subset, for network efficiency, but currently all of them are fetched, regardless of which subset you specify.
ReadBucketProxy hides the question of whether it has "started" or not (sent a request to the server to get metadata) from its user.
Download is optimized to do as few roundtrips and as few requests as possible, hopefully speeding up download a bit.
2009-01-05 16:51:45 +00:00
|
|
|
def get_share_hashes(self, at_least_these=()):
    """Hand back this fake bucket's share hash chain via self._start().

    at_least_these is accepted but never consulted. The delivered value
    depends on self.mode:

    * "missing sharehash": an empty list.
    * "bad sharehash": a copy of self.share_hashes in which the hash half
      of the entry at index 1 has been run through flip_bit().
    * anything else: self.share_hashes itself.

    Returns the object produced by self._start(), after the delivering
    callback has been added to it.
    """
    def _deliver(_ignored=None):
        if self.mode == "missing sharehash":
            # A sneaky server could claim not to know its own share
            # hash, which might end up framing somebody else;
            # download.py is supposed to guard against that.
            return []
        if self.mode == "bad sharehash":
            tampered = self.share_hashes[:]
            victim = tampered[1]
            tampered[1] = (victim[0], flip_bit(victim[1]))
            return tampered
        return self.share_hashes

    pending = self._start()
    pending.addCallback(_deliver)
    return pending
|
2007-03-30 20:20:01 +00:00
|
|
|
|
2007-06-08 22:59:16 +00:00
|
|
|
def get_uri_extension(self):
    """Hand back this fake bucket's URI extension via self._start().

    When self.mode is "bad uri_extension" the delivered value is
    flip_bit(self.uri_extension); in every other mode it is
    self.uri_extension unchanged.

    Returns the object produced by self._start(), after the delivering
    callback has been added to it.
    """
    def _deliver(_ignored=None):
        if self.mode != "bad uri_extension":
            return self.uri_extension
        return flip_bit(self.uri_extension)

    pending = self._start()
    pending.addCallback(_deliver)
    return pending
|
2007-06-08 02:32:29 +00:00
|
|
|
|
2007-03-30 20:20:01 +00:00
|
|
|
|
2007-04-17 19:29:56 +00:00
|
|
|
def make_data(length):
    """Return deterministic filler text, `length` characters long.

    The text is the phrase "happy happy joy joy" repeated end to end and
    cut to `length` characters. Earlier versions built exactly 100
    repetitions (1900 characters) and asserted that `length` fit inside
    them; the phrase is now repeated as often as needed, so any
    non-negative length works. Results for every length the old code
    accepted are unchanged.
    """
    phrase = "happy happy joy joy"
    # Never build fewer than the historical 100 repetitions: that keeps
    # the slice identical to the old output for all lengths <= 1900,
    # including negative ones, which slice from the end.
    needed = -(-length // len(phrase))  # ceiling division
    repeats = max(100, needed)
    return (phrase * repeats)[:length]
|
|
|
|
|
download: refactor handling of URI Extension Block and crypttext hash tree, simplify things
Refactor into a class the logic of asking each server in turn until one of them gives an answer
that validates. It is called ValidatedThingObtainer.
Refactor the downloading and verification of the URI Extension Block into a class named
ValidatedExtendedURIProxy.
The new logic of validating UEBs is minimalist: it doesn't require the UEB to contain any
unnecessary information, but of course it still accepts such information for backwards
compatibility (so that this new download code is able to download files uploaded with old, and
for that matter with current, upload code).
The new logic of validating UEBs follows the practice of doing all validation up front. This
practice advises one to isolate the validation of incoming data into one place, so that all of
the rest of the code can assume only valid data.
If any redundant information is present in the UEB+URI, the new code cross-checks and asserts
that it is all fully consistent. This closes some issues where the uploader could have
uploaded inconsistent redundant data, which would probably have caused the old downloader to
simply reject that download after getting a Python exception, but perhaps could have caused
greater harm to the old downloader.
I removed the notion of selecting an erasure codec from codec.py based on the string that was
passed in the UEB. Currently "crs" is the only such string that works, so
"_assert(codec_name == 'crs')" is simpler and more explicit. This is also in keeping with the
"validate up front" strategy -- now if someone sets a different string than "crs" in their UEB,
the downloader will reject the download in the "validate this UEB" function instead of in a
separate "select the codec instance" function.
I removed the code to check plaintext hashes and plaintext Merkle Trees. Uploaders do not
produce this information any more (since it potentially exposes confidential information about
the file), and the unit tests for it were disabled. The downloader before this patch would
check that plaintext hash or plaintext merkle tree if they were present, but not complain if
they were absent. The new downloader in this patch complains if they are present and doesn't
check them. (We might in the future re-introduce such hashes over the plaintext, but encrypt
the hashes which are stored in the UEB to preserve confidentiality. This would be a double-
check on the correctness of our own source code -- the current Merkle Tree over the ciphertext
is already sufficient to guarantee the integrity of the download unless there is a bug in our
Merkle Tree or AES implementation.)
This patch increases the lines-of-code count by 8 (from 17,770 to 17,778), and reduces the
uncovered-by-tests lines-of-code count by 24 (from 1408 to 1384). Those numbers would be more
meaningful if we omitted src/allmydata/util/ from the test-coverage statistics.
2008-12-05 15:17:54 +00:00
|
|
|
class ValidatedExtendedURIProxy(unittest.TestCase):
    """Check which URI Extension Blocks download.ValidatedExtendedURIProxy
    accepts and which it rejects.

    Each case packs a dict into a UEB string, stores it in a
    FakeBucketReaderWriterProxy, builds a verify-cap from the UEB's hash
    plus the class constants K, M and SIZE, and then runs
    download.ValidatedExtendedURIProxy(...).start() against it.
    """
    K = 4
    M = 10
    SIZE = 200
    SEGSIZE = 72

    # Tail segment size: whatever is left over after the full segments
    # (or a whole segment when SIZE divides evenly), rounded up to the
    # next multiple of K.
    _TMP = SIZE % SEGSIZE
    if _TMP == 0:
        _TMP = SEGSIZE
    if _TMP % K != 0:
        _TMP += (K - (_TMP % K))
    TAIL_SEGSIZE = _TMP

    # Number of segments: SIZE / SEGSIZE rounded up. Floor division is
    # spelled out so the result stays an integer regardless of which
    # division semantics are in effect.
    _TMP = SIZE // SEGSIZE
    if SIZE % SEGSIZE != 0:
        _TMP += 1
    NUM_SEGMENTS = _TMP

    # the smallest UEB these tests expect to be accepted
    mindict = { 'segment_size': SEGSIZE,
                'crypttext_root_hash': '0'*hashutil.CRYPTO_VAL_SIZE,
                'share_root_hash': '1'*hashutil.CRYPTO_VAL_SIZE }

    # extra fields, each holding a value consistent with the constants
    # above; adding any single one to mindict is expected to be accepted
    optional_consistent = { 'crypttext_hash': '2'*hashutil.CRYPTO_VAL_SIZE,
                            'codec_name': "crs",
                            'codec_params': "%d-%d-%d" % (SEGSIZE, K, M),
                            'tail_codec_params': "%d-%d-%d" % (TAIL_SEGSIZE, K, M),
                            'num_segments': NUM_SEGMENTS,
                            'size': SIZE,
                            'needed_shares': K,
                            'total_shares': M,
                            'plaintext_hash': "anything",
                            'plaintext_root_hash': "anything", }

    # For each optional field, a tuple of values that disagree with the
    # constants above; adding any single one to mindict is expected to be
    # rejected.
    # NOTE: two further crypttext_hash cases are currently disabled:
    #   '2'*(hashutil.CRYPTO_VAL_SIZE-1)   and   ""
    # Only the non-string value 77 is exercised.
    optional_inconsistent = { 'crypttext_hash': (77,),
                              'codec_name': ("digital fountain", ""),
                              'codec_params': ("%d-%d-%d" % (SEGSIZE, K-1, M),
                                               "%d-%d-%d" % (SEGSIZE-1, K, M),
                                               "%d-%d-%d" % (SEGSIZE, K, M-1)),
                              'tail_codec_params': ("%d-%d-%d" % (TAIL_SEGSIZE, K-1, M),
                                                    "%d-%d-%d" % (TAIL_SEGSIZE-1, K, M),
                                                    "%d-%d-%d" % (TAIL_SEGSIZE, K, M-1)),
                              'num_segments': (NUM_SEGMENTS-1,),
                              'size': (SIZE-1,),
                              'needed_shares': (K-1,),
                              'total_shares': (M-1,), }

    def _test(self, uebdict):
        """Pack uebdict into a UEB, serve it from a fake bucket, and return
        the result of starting a download.ValidatedExtendedURIProxy on it."""
        uebstring = uri.pack_extension(uebdict)
        uebhash = hashutil.uri_extension_hash(uebstring)
        fb = FakeBucketReaderWriterProxy()
        fb.put_uri_extension(uebstring)
        verifycap = uri.CHKFileVerifierURI(storage_index='x'*16,
                                           uri_extension_hash=uebhash,
                                           needed_shares=self.K,
                                           total_shares=self.M,
                                           size=self.SIZE)
        vup = download.ValidatedExtendedURIProxy(fb, verifycap)
        return vup.start()

    def _test_accept(self, uebdict):
        """Run _test(); any failure it produces propagates to the caller."""
        return self._test(uebdict)

    def _should_fail(self, res, expected_failures):
        """addBoth handler: pass only if res is a Failure wrapping one of
        expected_failures; a non-Failure result fails the test, and a
        Failure of any other type is re-raised by trap()."""
        if isinstance(res, Failure):
            res.trap(*expected_failures)
        else:
            self.fail("was supposed to raise %s, not get '%s'" % (expected_failures, res))

    def _test_reject(self, uebdict):
        """Run _test() and require it to fail with KeyError or
        download.BadURIExtension."""
        d = self._test(uebdict)
        d.addBoth(self._should_fail, (KeyError, download.BadURIExtension))
        return d

    def _gather(self, dl):
        """Combine per-case Deferreds so that any single failure fails the
        test that owns them.

        A plain DeferredList always fires its callback, recording failures
        only as (False, failure) entries in its result, so a failing case
        would not fail the test method that returned the list.
        fireOnOneErrback makes the first failure errback the list;
        consumeErrors keeps the remaining failures from being logged as
        unhandled errors.
        """
        return defer.DeferredList(dl, fireOnOneErrback=True,
                                  consumeErrors=True)

    def test_accept_minimal(self):
        return self._test_accept(self.mindict)

    def test_reject_insufficient(self):
        # removing any single required field must cause rejection
        dl = []
        for k in self.mindict:
            insuffdict = self.mindict.copy()
            del insuffdict[k]
            dl.append(self._test_reject(insuffdict))
        return self._gather(dl)

    def test_accept_optional(self):
        # adding any single consistent optional field must be accepted
        dl = []
        for k in self.optional_consistent:
            mydict = self.mindict.copy()
            mydict[k] = self.optional_consistent[k]
            dl.append(self._test_accept(mydict))
        return self._gather(dl)

    def test_reject_optional(self):
        # adding any single inconsistent optional value must be rejected
        dl = []
        for k in self.optional_inconsistent:
            for v in self.optional_inconsistent[k]:
                mydict = self.mindict.copy()
                mydict[k] = v
                dl.append(self._test_reject(mydict))
        return self._gather(dl)
|
|
|
|
|
2007-03-30 23:50:50 +00:00
|
|
|
class Encode(unittest.TestCase):
    """Encode small files to fake buckets and check what each bucket got."""

    def do_encode(self, max_segment_size, datalen, NUM_SHARES, NUM_SEGMENTS,
                  expected_block_hashes, expected_share_hashes):
        """Encode datalen bytes of filler with k=25, happy=75, n=100 and
        verify every shareholder's blocks, block hashes and share hashes.

        NUM_SHARES and NUM_SEGMENTS are what the encoder is expected to
        report; expected_block_hashes and expected_share_hashes are the
        number of hashes each peer must end up holding.
        """
        data = make_data(datalen)
        encoder = encode.Encoder()
        uploadable = upload.Data(data, convergence="some convergence string")
        # a small max_segment_size forces the use of multiple segments
        uploadable.max_segment_size = max_segment_size
        uploadable.encoding_param_k = 25
        uploadable.encoding_param_happy = 75
        uploadable.encoding_param_n = 100
        encrypted = upload.EncryptAnUploadable(uploadable)
        d = encoder.set_encrypted_uploadable(encrypted)

        all_shareholders = []

        def _attach_peers_and_start(res):
            k, happy, n = encoder.get_param("share_counts")
            _assert(n == NUM_SHARES) # otherwise nothing below makes sense
            numsegs = encoder.get_param("num_segments")
            _assert(numsegs == NUM_SEGMENTS, numsegs, NUM_SEGMENTS)
            segsize = encoder.get_param("segment_size")
            lower = (NUM_SEGMENTS-1)*segsize
            upper = NUM_SEGMENTS*segsize
            _assert(lower < len(data) <= upper,
                    NUM_SEGMENTS, segsize,
                    lower, len(data), upper)

            # one fake bucket per share number
            peers = [FakeBucketReaderWriterProxy()
                     for _ in range(NUM_SHARES)]
            all_shareholders.extend(peers)
            encoder.set_shareholders(dict(enumerate(peers)))
            return encoder.start()
        d.addCallback(_attach_peers_and_start)

        def _inspect_peers(verifycap):
            self.failUnless(isinstance(verifycap.uri_extension_hash, str))
            self.failUnlessEqual(len(verifycap.uri_extension_hash), 32)
            for peer in all_shareholders:
                self.failUnless(peer.closed)
                self.failUnlessEqual(len(peer.blocks), NUM_SEGMENTS)
                # Every peer holds a complete block hash tree: 7 hashes
                # for 3 or 4 segments, 15 hashes for 5 segments.
                self.failUnlessEqual(len(peer.block_hashes),
                                     expected_block_hashes)
                for blockhash in peer.block_hashes:
                    self.failUnlessEqual(len(blockhash), 32)
                # Every peer also holds the share hash chain it needs:
                # with 100 shares (rounded up to 128 leaves) that is 8
                # hashes.
                self.failUnlessEqual(len(peer.share_hashes),
                                     expected_share_hashes)
                for (hashnum, sharehash) in peer.share_hashes:
                    self.failUnless(isinstance(hashnum, int))
                    self.failUnlessEqual(len(sharehash), 32)
        d.addCallback(_inspect_peers)

        return d

    # The cases below form a 3x3 grid of edge conditions.
    #
    # One axis is how the plaintext falls into segments: kn+(-1,0,1), or
    # put differently n%k == -1 or 0 or 1. With 25-byte segments that
    # means sizes such as 74, 75 and 76 bytes.
    #
    # The other axis is how many leaves the block hash tree ends up with
    # relative to a power of two, 2^a+(-1,0,1). Each segment becomes one
    # leaf, so the interesting counts are 3, 4 and 5 segments.
    #
    # Together these give the following data lengths:
    #  3 segs: 74, 75, 51
    #  4 segs: 99, 100, 76
    #  5 segs: 124, 125, 101
    #
    # Every case encodes to 100 shares, so the share hash tree has 128
    # leaves and each bucket receives an 8-long share hash chain.
    #
    # 3-segment and 4-segment files both get a 4-leaf block hash tree and
    # therefore 7 block hashes; 5-segment files get an 8-leaf tree and
    # therefore 15 block hashes.

    def test_send_74(self):
        # three segments of 25, 25 and 24 bytes
        return self.do_encode(25, 74, 100, 3, 7, 8)

    def test_send_75(self):
        # three segments of 25, 25 and 25 bytes
        return self.do_encode(25, 75, 100, 3, 7, 8)

    def test_send_51(self):
        # three segments of 25, 25 and 1 bytes
        return self.do_encode(25, 51, 100, 3, 7, 8)

    def test_send_76(self):
        # a 76-byte file in four segments (25, 25, 25, 1), 100 shares
        return self.do_encode(25, 76, 100, 4, 7, 8)

    def test_send_99(self):
        # four segments of 25, 25, 25 and 24 bytes
        return self.do_encode(25, 99, 100, 4, 7, 8)

    def test_send_100(self):
        # four segments of 25 bytes each
        return self.do_encode(25, 100, 100, 4, 7, 8)

    def test_send_124(self):
        # five segments of 25, 25, 25, 25 and 24 bytes
        return self.do_encode(25, 124, 100, 5, 15, 8)

    def test_send_125(self):
        # five segments of 25 bytes each
        return self.do_encode(25, 125, 100, 5, 15, 8)

    def test_send_101(self):
        # five segments of 25, 25, 25, 25 and 1 bytes
        return self.do_encode(25, 101, 100, 5, 15, 8)
|
|
|
|
|
2008-07-14 22:25:21 +00:00
|
|
|
class PausingTarget(download.Data):
    """A download.Data target that pauses its producer on each of its first
    two writes and resumes it 0.1 seconds later.

    It counts the bytes (self.size) and the write() calls (self.writes) it
    has seen.
    """
    implements(IConsumer)

    def __init__(self):
        download.Data.__init__(self)
        self.size = self.writes = 0

    def registerProducer(self, producer, streaming):
        # the streaming flag is ignored; only the producer is remembered
        self.producer = producer

    def unregisterProducer(self):
        self.producer = None

    def _unpause(self):
        self.producer.resumeProducing()

    def write(self, data):
        """Record the write, pause/resume on the first two calls, then
        forward the data to download.Data.write()."""
        self.size += len(data)
        self.writes += 1
        # The tests happen to use 4 segments; pausing on the final write
        # is avoided because the _unpause timer would then still be
        # pending afterwards.
        should_pause = self.writes <= 2
        if should_pause:
            self.producer.pauseProducing()
            reactor.callLater(0.1, self._unpause)
        return download.Data.write(self, data)
|
|
|
|
|
|
|
|
class PausingAndStoppingTarget(PausingTarget):
    """A PausingTarget variant that pauses its producer on every write and
    asks it to stop half a second later."""

    def write(self, data):
        # Unlike PausingTarget.write, this neither updates the size/writes
        # counters nor forwards the data to download.Data.write; the data
        # is dropped and None is returned.
        self.producer.pauseProducing()
        reactor.callLater(0.5, self._stop)

    def _stop(self):
        # timer callback scheduled by write()
        self.producer.stopProducing()
|
|
|
|
|
|
|
|
class StoppingTarget(PausingTarget):
    """A PausingTarget variant that tells its producer to stop as soon as
    the first write arrives."""

    def write(self, data):
        # The data is dropped: nothing is counted or forwarded to
        # download.Data.write, and None is returned.
        self.producer.stopProducing()
|
|
|
|
|
|
|
|
class Roundtrip(unittest.TestCase, testutil.ShouldFailMixin):
|
2007-06-06 17:32:40 +00:00
|
|
|
def send_and_recover(self, k_and_happy_and_n=(25,75,100),
                     AVAILABLE_SHARES=None,
                     datalen=76,
                     max_segment_size=25,
                     bucket_modes=None,
                     recover_mode="recover",
                     target=None,
                     ):
    """Upload datalen bytes of filler via self.send(), download it again
    via self.recover(), and check that the data round-trips unchanged.

    k_and_happy_and_n -- (k, happy, n) encoding parameters.
    AVAILABLE_SHARES -- passed on to send() and recover(); defaults to n
        (the third element of k_and_happy_and_n).
    datalen -- number of bytes of filler produced by make_data().
    max_segment_size -- passed on to send().
    bucket_modes -- dict passed on to send(); defaults to a fresh empty
        dict for each call.
    recover_mode, target -- passed on to recover().

    Returns the Deferred from send(); once the comparison has passed it
    fires with the second element of the pair produced by recover().
    """
    if AVAILABLE_SHARES is None:
        AVAILABLE_SHARES = k_and_happy_and_n[2]
    if bucket_modes is None:
        # a fresh dict per call, rather than one shared default object
        bucket_modes = {}
    data = make_data(datalen)
    d = self.send(k_and_happy_and_n, AVAILABLE_SHARES,
                  max_segment_size, bucket_modes, data)
    # Original note: send() fires with (uri_extension_hash, e,
    # shareholders). That cannot be confirmed from this method alone;
    # whatever it fires with is handed to recover() as its first argument.
    d.addCallback(self.recover, AVAILABLE_SHARES, recover_mode,
                  target=target)
    # recover() fires with a (newdata, fd) pair
    def _downloaded(res):
        # unpack in the body: tuple parameters are Python-2-only syntax
        newdata, fd = res
        self.failUnless(newdata == data, str((len(newdata), len(data))))
        return fd
    d.addCallback(_downloaded)
    return d
|
|
|
|
|
|
|
|
def send(self, k_and_happy_and_n, AVAILABLE_SHARES, max_segment_size,
         bucket_modes, data):
    """Encode ``data`` and store one share in each of n fake buckets.

    k_and_happy_and_n: 3-tuple ``(k, happy, n)`` copied onto the uploadable
        as its encoding parameters.
    AVAILABLE_SHARES: not used by the encoding steps in this method; kept
        in the signature for existing callers.
    max_segment_size: copied onto the uploadable.
    bucket_modes: dict mapping share number to the mode string handed to
        FakeBucketReaderWriterProxy; share numbers that are absent get
        "good".
    data: the contents to upload.

    Returns a Deferred that fires with ``(res, key, shareholders)``, where
    ``res`` is whatever the encoder's start() Deferred fired with, ``key``
    is the uploadable's encryption key and ``shareholders`` maps share
    number to its bucket.
    """
    k, happy, n = k_and_happy_and_n
    encoder = encode.Encoder()
    uploadable = upload.Data(data, convergence="some convergence string")
    # force use of multiple segments by using a low max_segment_size
    uploadable.max_segment_size = max_segment_size
    uploadable.encoding_param_k = k
    uploadable.encoding_param_happy = happy
    uploadable.encoding_param_n = n
    encrypted = upload.EncryptAnUploadable(uploadable)
    d = encoder.set_encrypted_uploadable(encrypted)

    shareholders = {}

    def _ready(res):
        # the encoder must have settled on the share count we asked for,
        # else we'll be completely confused
        _k, _happy, encoder_n = encoder.get_param("share_counts")
        assert encoder_n == n
        for shnum in range(n):
            mode = bucket_modes.get(shnum, "good")
            shareholders[shnum] = FakeBucketReaderWriterProxy(mode)
        encoder.set_shareholders(shareholders)
        return encoder.start()
    d.addCallback(_ready)

    def _sent(res):
        # bundle the encoder's result with the key and the buckets so the
        # next step has everything it needs to download
        d1 = uploadable.get_encryption_key()
        d1.addCallback(lambda key: (res, key, shareholders))
        return d1
    d.addCallback(_sent)
    return d
|
2007-03-30 20:20:01 +00:00
|
|
|
|
2007-07-24 02:31:53 +00:00
|
|
|
def recover(self, (res, key, shareholders), AVAILABLE_SHARES,
            recover_mode, target=None):
    """Download the file that send() stored in the fake shareholders.

    The first positional argument is the 3-tuple that send()'s Deferred
    fires with: ``res`` (used here as the verify cap), the encryption
    ``key``, and the ``shareholders`` dict mapping share number to bucket.
    The tuple-unpacking parameter is Python 2-only syntax.

    AVAILABLE_SHARES: only buckets whose share number is below this value
        (and whose ``closed`` attribute is true) are handed to the
        downloader.
    recover_mode: searched with ``in`` for the markers "corrupt_key" and
        "corrupt_crypttext_hashes"; each one enables a deliberate
        corruption step below.
    target: download target to write into. When it is falsy a fresh
        download.Data() is created. Either way it is wrapped in a
        download.DecryptingTarget before use.

    Returns a Deferred that fires with ``(newdata, fd)``: the value that
    fd._done produced, and the CiphertextDownloader itself.
    """
    # from here on the encoder's result is treated as a verify cap; its
    # uri_extension_hash / needed_shares / total_shares / size are read below
    verifycap = res

    if "corrupt_key" in recover_mode:
        # we corrupt the key, so that the decrypted data is corrupted and
        # will fail the plaintext hash check. Since we're manually
        # attaching shareholders, the fact that the storage index is also
        # corrupted doesn't matter.
        # NOTE(review): confirm that a plaintext hash check still exists in
        # the downloader; if not, this comment needs updating to name the
        # check that actually catches the corruption.
        key = flip_bit(key)

    u = uri.CHKFileURI(key=key,
                       uri_extension_hash=verifycap.uri_extension_hash,
                       needed_shares=verifycap.needed_shares,
                       total_shares=verifycap.total_shares,
                       size=verifycap.size)

    client = FakeClient()
    if not target:
        target = download.Data()
    # the downloader is handed the verify cap only; decryption with u.key
    # happens in this wrapper around the target
    target = download.DecryptingTarget(target, u.key)
    fd = download.CiphertextDownloader(client, u.get_verify_cap(), target, monitor=Monitor())

    # we manually cycle the CiphertextDownloader through a number of steps that
    # would normally be sequenced by a Deferred chain in
    # CiphertextDownloader.start(), to give us more control over the process.
    # In particular, by bypassing _get_all_shareholders, we skip
    # permuted-peerlist selection.
    for shnum, bucket in shareholders.items():
        # skip shares beyond the available count and buckets that are not
        # marked closed
        if shnum < AVAILABLE_SHARES and bucket.closed:
            fd.add_share_bucket(shnum, bucket)
    fd._got_all_shareholders(None)

    # Make it possible to obtain uri_extension from the shareholders.
    # Arrange for shareholders[0] to be the first, so we can selectively
    # corrupt the data it returns.
    # NOTE(review): uri_extension_sources is not referenced again in this
    # method; these three lines also rely on Python 2's dict.values()
    # returning a list. Confirm whether they are still needed.
    uri_extension_sources = shareholders.values()
    uri_extension_sources.remove(shareholders[0])
    uri_extension_sources.insert(0, shareholders[0])

    # start from an already-fired Deferred so that each step below runs as
    # a callback, in the order it is added
    d = defer.succeed(None)

    # have the CiphertextDownloader retrieve a copy of uri_extension itself
    d.addCallback(fd._obtain_uri_extension)

    if "corrupt_crypttext_hashes" in recover_mode:
        # replace everybody's crypttext hash trees with a different one
        # (computed over a different file), then modify our uri_extension
        # to reflect the new crypttext hash tree root
        def _corrupt_crypttext_hashes(unused):
            assert isinstance(fd._vup, download.ValidatedExtendedURIProxy), fd._vup
            assert fd._vup.crypttext_root_hash, fd._vup
            badhash = hashutil.tagged_hash("bogus", "data")
            bad_crypttext_hashes = [badhash] * fd._vup.num_segments
            badtree = hashtree.HashTree(bad_crypttext_hashes)
            for bucket in shareholders.values():
                bucket.crypttext_hashes = list(badtree)
            # seed the downloader's own tree with the bogus root (index 0)
            fd._crypttext_hash_tree = hashtree.IncompleteHashTree(fd._vup.num_segments)
            fd._crypttext_hash_tree.set_hashes({0: badtree[0]})
            # pass fd._vup along as the value for the next callback
            return fd._vup
        d.addCallback(_corrupt_crypttext_hashes)

    # also have the CiphertextDownloader ask for hash trees
    d.addCallback(fd._get_crypttext_hash_tree)

    d.addCallback(fd._download_all_segments)
    d.addCallback(fd._done)
    def _done(newdata):
        # pair the result with the downloader so callers can inspect fd
        return (newdata, fd)
    d.addCallback(_done)
    return d
|
2007-03-30 23:50:50 +00:00
|
|
|
|
2007-04-17 00:21:37 +00:00
|
|
|
def test_not_enough_shares(self):
    """Recovery fails when fewer shares are offered than k requires.

    The file is encoded with (k, happy, n) = (4, 8, 10) but only 2 shares
    are made available to the downloader, so the Deferred must end in a
    Failure wrapping NotEnoughSharesError.
    """
    d = self.send_and_recover((4,8,10), AVAILABLE_SHARES=2)
    def _done(res):
        # pass res as the message so an unexpected outcome shows up in
        # the test report instead of a bare assertion failure
        self.failUnless(isinstance(res, Failure), res)
        self.failUnless(res.check(NotEnoughSharesError), res)
    d.addBoth(_done)
    return d
|
|
|
|
|
2007-03-30 23:50:50 +00:00
|
|
|
def test_one_share_per_peer(self):
    """Run the encode/download round trip with every argument defaulted."""
    d = self.send_and_recover()
    return d
|
2007-04-16 23:30:21 +00:00
|
|
|
|
2007-04-17 19:57:55 +00:00
|
|
|
def test_74(self):
    """Round-trip a file of datalen=74."""
    d = self.send_and_recover(datalen=74)
    return d
|
|
|
|
def test_75(self):
    """Round-trip a file of datalen=75."""
    d = self.send_and_recover(datalen=75)
    return d
|
|
|
|
def test_51(self):
    """Round-trip a file of datalen=51."""
    d = self.send_and_recover(datalen=51)
    return d
|
|
|
|
|
|
|
|
def test_99(self):
    """Round-trip a file of datalen=99."""
    d = self.send_and_recover(datalen=99)
    return d
|
|
|
|
def test_100(self):
    """Round-trip a file of datalen=100."""
    d = self.send_and_recover(datalen=100)
    return d
|
|
|
|
def test_76(self):
    """Round-trip a file of datalen=76."""
    d = self.send_and_recover(datalen=76)
    return d
|
|
|
|
|
|
|
|
def test_124(self):
    """Round-trip a file of datalen=124."""
    d = self.send_and_recover(datalen=124)
    return d
|
|
|
|
def test_125(self):
    """Round-trip a file of datalen=125."""
    d = self.send_and_recover(datalen=125)
    return d
|
|
|
|
def test_101(self):
    """Round-trip a file of datalen=101."""
    d = self.send_and_recover(datalen=101)
    return d
|
2007-04-17 02:55:03 +00:00
|
|
|
|
2008-07-14 22:25:21 +00:00
|
|
|
def test_pause(self):
    """Round-trip a file into a target that pauses and resumes.

    The DownloadTarget used here does pauseProducing/resumeProducing a few
    times, then finishes.
    """
    return self.send_and_recover(target=PausingTarget())
|
|
|
|
|
|
|
|
def test_pause_then_stop(self):
    """A target that pauses and then stops must abort the download.

    shouldFail is asked to expect download.DownloadStopped carrying the
    stopProducing() message.
    """
    stopping_target = PausingAndStoppingTarget()
    expected_text = "our Consumer called stopProducing()"
    return self.shouldFail(download.DownloadStopped,
                           "test_pause_then_stop",
                           expected_text,
                           self.send_and_recover,
                           target=stopping_target)
|
|
|
|
|
|
|
|
def test_stop(self):
    """A target that stops immediately must abort the download (ticket #473).

    shouldFail is asked to expect download.DownloadStopped carrying the
    stopProducing() message.
    """
    stopping_target = StoppingTarget()
    expected_text = "our Consumer called stopProducing()"
    return self.shouldFail(download.DownloadStopped,
                           "test_stop",
                           expected_text,
                           self.send_and_recover,
                           target=stopping_target)
|
|
|
|
|
2007-04-19 17:56:15 +00:00
|
|
|
# the following tests all use 4-out-of-10 encoding
|
|
|
|
|
2007-04-16 23:30:21 +00:00
|
|
|
def test_bad_blocks(self):
    """Download succeeds when exactly k=4 of the 10 shares are good.

    The first 6 servers have bad blocks, which will be caught by the
    blockhashes; shares 6-9 are left "good".
    """
    modemap = dict.fromkeys(range(6), "bad block")
    modemap.update(dict.fromkeys(range(6, 10), "good"))
    d = self.send_and_recover((4,8,10), bucket_modes=modemap)
    return d
|
2007-04-16 23:30:21 +00:00
|
|
|
|
|
|
|
def test_bad_blocks_failure(self):
    """Download fails when only 3 of the 10 shares are good (k=4).

    The first 7 servers have bad blocks, which will be caught by the
    blockhashes, and the download will fail with NotEnoughSharesError.
    """
    modemap = dict.fromkeys(range(7), "bad block")
    modemap.update(dict.fromkeys(range(7, 10), "good"))
    d = self.send_and_recover((4,8,10), bucket_modes=modemap)

    def _check_failed(outcome):
        # addBoth hands us either a result or a Failure; only a Failure
        # wrapping NotEnoughSharesError is acceptable
        self.failUnless(isinstance(outcome, Failure), outcome)
        self.failUnless(outcome.check(NotEnoughSharesError), outcome)
    d.addBoth(_check_failed)
    return d
|
|
|
|
|
|
|
|
def test_bad_blockhashes(self):
    """Download succeeds when exactly k=4 of the 10 shares are good.

    The first 6 servers have bad block hashes, so their blockhash trees
    will not validate; shares 6-9 are left "good".
    """
    modemap = dict.fromkeys(range(6), "bad blockhash")
    modemap.update(dict.fromkeys(range(6, 10), "good"))
    d = self.send_and_recover((4,8,10), bucket_modes=modemap)
    return d
|
2007-04-16 23:30:21 +00:00
|
|
|
|
|
|
|
def test_bad_blockhashes_failure(self):
    """Download fails when only 3 of the 10 shares are good (k=4).

    The first 7 servers have bad block hashes, so the blockhash tree will
    not validate, and the download will fail with NotEnoughSharesError.
    """
    # the first 7 servers have bad block hashes, so the blockhash tree
    # will not validate, and the download will fail
    modemap = dict([(i, "bad blockhash")
                    for i in range(7)]
                   + [(i, "good")
                      for i in range(7, 10)])
    d = self.send_and_recover((4,8,10), bucket_modes=modemap)
    def _done(res):
        # pass res as the message on both checks so an unexpected success
        # value is visible in the test report
        self.failUnless(isinstance(res, Failure), res)
        self.failUnless(res.check(NotEnoughSharesError), res)
    d.addBoth(_done)
    return d
|
|
|
|
|
|
|
|
def test_bad_sharehashes(self):
    """Download succeeds when exactly k=4 of the 10 shares are good.

    The first 6 servers are put in "bad sharehash" mode, so the sharehash
    tree will not validate for them; shares 6-9 are left "good".
    """
    modemap = dict.fromkeys(range(6), "bad sharehash")
    modemap.update(dict.fromkeys(range(6, 10), "good"))
    d = self.send_and_recover((4,8,10), bucket_modes=modemap)
    return d
|
2007-04-16 23:30:21 +00:00
|
|
|
|
2007-06-08 02:32:29 +00:00
|
|
|
def assertFetchFailureIn(self, fd, where):
|
2007-06-08 22:59:16 +00:00
|
|
|
expected = {"uri_extension": 0,
|
download: refactor handling of URI Extension Block and crypttext hash tree, simplify things
Refactor into a class the logic of asking each server in turn until one of them gives an answer
that validates. It is called ValidatedThingObtainer.
Refactor the downloading and verification of the URI Extension Block into a class named
ValidatedExtendedURIProxy.
The new logic of validating UEBs is minimalist: it doesn't require the UEB to contain any
unnecessary information, but of course it still accepts such information for backwards
compatibility (so that this new download code is able to download files uploaded with old, and
for that matter with current, upload code).
The new logic of validating UEBs follows the practice of doing all validation up front. This
practice advises one to isolate the validation of incoming data into one place, so that all of
the rest of the code can assume only valid data.
If any redundant information is present in the UEB+URI, the new code cross-checks and asserts
that it is all fully consistent. This closes some issues where the uploader could have
uploaded inconsistent redundant data, which would probably have caused the old downloader to
simply reject that download after getting a Python exception, but perhaps could have caused
greater harm to the old downloader.
I removed the notion of selecting an erasure codec from codec.py based on the string that was
passed in the UEB. Currently "crs" is the only such string that works, so
"_assert(codec_name == 'crs')" is simpler and more explicit. This is also in keeping with the
"validate up front" strategy -- now if someone sets a different string than "crs" in their UEB,
the downloader will reject the download in the "validate this UEB" function instead of in a
separate "select the codec instance" function.
I removed the code to check plaintext hashes and plaintext Merkle Trees. Uploaders do not
produce this information any more (since it potentially exposes confidential information about
the file), and the unit tests for it were disabled. The downloader before this patch would
check that plaintext hash or plaintext merkle tree if they were present, but not complain if
they were absent. The new downloader in this patch complains if they are present and doesn't
check them. (We might in the future re-introduce such hashes over the plaintext, but encrypt
the hashes which are stored in the UEB to preserve confidentiality. This would be a double-
check on the correctness of our own source code -- the current Merkle Tree over the ciphertext
is already sufficient to guarantee the integrity of the download unless there is a bug in our
Merkle Tree or AES implementation.)
This patch increases the lines-of-code count by 8 (from 17,770 to 17,778), and reduces the
uncovered-by-tests lines-of-code count by 24 (from 1408 to 1384). Those numbers would be more
meaningful if we omitted src/allmydata/util/ from the test-coverage statistics.
2008-12-05 15:17:54 +00:00
|
|
|
"crypttext_hash_tree": 0,
|
2007-06-08 02:32:29 +00:00
|
|
|
}
|
|
|
|
if where is not None:
|
|
|
|
expected[where] += 1
|
|
|
|
self.failUnlessEqual(fd._fetch_failures, expected)
|
|
|
|
|
2007-06-08 04:24:39 +00:00
|
|
|
def test_good(self):
    # Baseline run: every bucket is "good", so this only confirms that
    # the test harness works when no failure is being injected. Passing
    # None to assertFetchFailureIn() requires every fetch-failure
    # counter to still be zero.
    all_good = dict.fromkeys(range(10), "good")
    d = self.send_and_recover((4,8,10), bucket_modes=all_good)
    d.addCallback(self.assertFetchFailureIn, None)
    return d
|
|
|
|
|
2007-06-08 22:59:16 +00:00
|
|
|
def test_bad_uri_extension(self):
    # Only bucket 0 is given a bad uri_extension block, so the download
    # is expected to fail over to a different server. Afterwards exactly
    # one "uri_extension" fetch failure must have been counted.
    bucket_modes = dict.fromkeys(range(1, 10), "good")
    bucket_modes[0] = "bad uri_extension"
    d = self.send_and_recover((4,8,10), bucket_modes=bucket_modes)
    d.addCallback(self.assertFetchFailureIn, "uri_extension")
    return d
|
|
|
|
|
2008-03-23 22:35:54 +00:00
|
|
|
def test_bad_crypttext_hashroot(self):
|
|
|
|
# the first server has a bad crypttext hashroot, so we will fail
|
|
|
|
# over to a different server.
|
|
|
|
modemap = dict([(i, "bad crypttext hashroot") for i in range(1)] +
|
|
|
|
[(i, "good") for i in range(1, 10)])
|
|
|
|
d = self.send_and_recover((4,8,10), bucket_modes=modemap)
|
download: refactor handling of URI Extension Block and crypttext hash tree, simplify things
Refactor into a class the logic of asking each server in turn until one of them gives an answer
that validates. It is called ValidatedThingObtainer.
Refactor the downloading and verification of the URI Extension Block into a class named
ValidatedExtendedURIProxy.
The new logic of validating UEBs is minimalist: it doesn't require the UEB to contain any
unnecessary information, but of course it still accepts such information for backwards
compatibility (so that this new download code is able to download files uploaded with old, and
for that matter with current, upload code).
The new logic of validating UEBs follows the practice of doing all validation up front. This
practice advises one to isolate the validation of incoming data into one place, so that all of
the rest of the code can assume only valid data.
If any redundant information is present in the UEB+URI, the new code cross-checks and asserts
that it is all fully consistent. This closes some issues where the uploader could have
uploaded inconsistent redundant data, which would probably have caused the old downloader to
simply reject that download after getting a Python exception, but perhaps could have caused
greater harm to the old downloader.
I removed the notion of selecting an erasure codec from codec.py based on the string that was
passed in the UEB. Currently "crs" is the only such string that works, so
"_assert(codec_name == 'crs')" is simpler and more explicit. This is also in keeping with the
"validate up front" strategy -- now if someone sets a different string than "crs" in their UEB,
the downloader will reject the download in the "validate this UEB" function instead of in a
separate "select the codec instance" function.
I removed the code to check plaintext hashes and plaintext Merkle Trees. Uploaders do not
produce this information any more (since it potentially exposes confidential information about
the file), and the unit tests for it were disabled. The downloader before this patch would
check that plaintext hash or plaintext merkle tree if they were present, but not complain if
they were absent. The new downloader in this patch complains if they are present and doesn't
check them. (We might in the future re-introduce such hashes over the plaintext, but encrypt
the hashes which are stored in the UEB to preserve confidentiality. This would be a double-
check on the correctness of our own source code -- the current Merkle Tree over the ciphertext
is already sufficient to guarantee the integrity of the download unless there is a bug in our
Merkle Tree or AES implementation.)
This patch increases the lines-of-code count by 8 (from 17,770 to 17,778), and reduces the
uncovered-by-tests lines-of-code count by 24 (from 1408 to 1384). Those numbers would be more
meaningful if we omitted src/allmydata/util/ from the test-coverage statistics.
2008-12-05 15:17:54 +00:00
|
|
|
d.addCallback(self.assertFetchFailureIn, "crypttext_hash_tree")
|
2008-03-23 22:35:54 +00:00
|
|
|
return d
|
|
|
|
|
|
|
|
def test_bad_crypttext_hashes(self):
|
|
|
|
# the first server has a bad crypttext hash block, so we will fail
|
|
|
|
# over to a different server.
|
|
|
|
modemap = dict([(i, "bad crypttext hash") for i in range(1)] +
|
|
|
|
[(i, "good") for i in range(1, 10)])
|
|
|
|
d = self.send_and_recover((4,8,10), bucket_modes=modemap)
|
download: refactor handling of URI Extension Block and crypttext hash tree, simplify things
Refactor into a class the logic of asking each server in turn until one of them gives an answer
that validates. It is called ValidatedThingObtainer.
Refactor the downloading and verification of the URI Extension Block into a class named
ValidatedExtendedURIProxy.
The new logic of validating UEBs is minimalist: it doesn't require the UEB to contain any
unnecessary information, but of course it still accepts such information for backwards
compatibility (so that this new download code is able to download files uploaded with old, and
for that matter with current, upload code).
The new logic of validating UEBs follows the practice of doing all validation up front. This
practice advises one to isolate the validation of incoming data into one place, so that all of
the rest of the code can assume only valid data.
If any redundant information is present in the UEB+URI, the new code cross-checks and asserts
that it is all fully consistent. This closes some issues where the uploader could have
uploaded inconsistent redundant data, which would probably have caused the old downloader to
simply reject that download after getting a Python exception, but perhaps could have caused
greater harm to the old downloader.
I removed the notion of selecting an erasure codec from codec.py based on the string that was
passed in the UEB. Currently "crs" is the only such string that works, so
"_assert(codec_name == 'crs')" is simpler and more explicit. This is also in keeping with the
"validate up front" strategy -- now if someone sets a different string than "crs" in their UEB,
the downloader will reject the download in the "validate this UEB" function instead of in a
separate "select the codec instance" function.
I removed the code to check plaintext hashes and plaintext Merkle Trees. Uploaders do not
produce this information any more (since it potentially exposes confidential information about
the file), and the unit tests for it were disabled. The downloader before this patch would
check that plaintext hash or plaintext merkle tree if they were present, but not complain if
they were absent. The new downloader in this patch complains if they are present and doesn't
check them. (We might in the future re-introduce such hashes over the plaintext, but encrypt
the hashes which are stored in the UEB to preserve confidentiality. This would be a double-
check on the correctness of our own source code -- the current Merkle Tree over the ciphertext
is already sufficient to guarantee the integrity of the download unless there is a bug in our
Merkle Tree or AES implementation.)
This patch increases the lines-of-code count by 8 (from 17,770 to 17,778), and reduces the
uncovered-by-tests lines-of-code count by 24 (from 1408 to 1384). Those numbers would be more
meaningful if we omitted src/allmydata/util/ from the test-coverage statistics.
2008-12-05 15:17:54 +00:00
|
|
|
d.addCallback(self.assertFetchFailureIn, "crypttext_hash_tree")
|
2008-03-23 22:35:54 +00:00
|
|
|
return d
|
|
|
|
|
|
|
|
def test_bad_crypttext_hashes_failure(self):
    # Verifies that the crypttext Merkle tree is really being enforced.
    # All buckets are "good", but the recover mode asks the harness to
    # sneak into the download and corrupt two things: everybody's
    # crypttext hash tree is swapped for one computed over bogus data,
    # and the supposedly-validated uri_extension block is altered to
    # match the new tree's root. The download should then notice that
    # the crypttext coming out of FEC does not match the tree, and fail.
    all_good = dict.fromkeys(range(10), "good")
    # recover_mode is a plain string here, not a one-element tuple.
    d = self.send_and_recover((4,8,10), bucket_modes=all_good,
                              recover_mode="corrupt_crypttext_hashes")

    def _expect_bad_hash(outcome):
        # The Deferred must errback, and specifically with BadHashError.
        self.failUnless(isinstance(outcome, Failure))
        self.failUnless(outcome.check(hashtree.BadHashError), outcome)

    d.addBoth(_expect_bad_hash)
    return d
|
|
|
|
|
2008-03-24 20:39:51 +00:00
|
|
|
def OFF_test_bad_plaintext(self):
    # Disabled test (the name does not start with "test"). Faking a
    # decryption failure is easy: all buckets stay "good" and only the
    # key is corrupted via the recover mode. The result is expected to
    # be a Failure wrapping BadHashError.
    all_good = dict.fromkeys(range(10), "good")
    # recover_mode is a plain string here, not a one-element tuple.
    d = self.send_and_recover((4,8,10), bucket_modes=all_good,
                              recover_mode="corrupt_key")

    def _expect_bad_hash(outcome):
        self.failUnless(isinstance(outcome, Failure))
        self.failUnless(outcome.check(hashtree.BadHashError), outcome)

    d.addBoth(_expect_bad_hash)
    return d
|
|
|
|
|
2007-04-16 23:30:21 +00:00
|
|
|
def test_bad_sharehashes_failure(self):
|
2009-01-06 00:53:42 +00:00
|
|
|
# all ten servers have bad share hashes, so the sharehash tree
|
2007-04-16 23:30:21 +00:00
|
|
|
# will not validate, and the download will fail
|
|
|
|
modemap = dict([(i, "bad sharehash")
|
immutable: refactor downloader to be more reusable for checker/verifier/repairer (and better)
The code for validating the share hash tree and the block hash tree has been rewritten to make sure it handles all cases, to share metadata about the file (such as the share hash tree, block hash trees, and UEB) among different share downloads, and not to require hashes to be stored on the server unnecessarily, such as the roots of the block hash trees (not needed since they are also the leaves of the share hash tree), and the root of the share hash tree (not needed since it is also included in the UEB). It also passes the latest tests including handling corrupted shares well.
ValidatedReadBucketProxy takes a share_hash_tree argument to its constructor, which is a reference to a share hash tree shared by all ValidatedReadBucketProxies for that immutable file download.
ValidatedReadBucketProxy requires the block_size and share_size to be provided in its constructor, and it then uses those to compute the offsets and lengths of blocks when it needs them, instead of reading those values out of the share. The user of ValidatedReadBucketProxy therefore has to have first used a ValidatedExtendedURIProxy to compute those two values from the validated contents of the URI. This pleasingly simplifies safety analysis: the client knows which span of bytes corresponds to a given block from the validated URI data, rather than from the unvalidated data stored on the storage server. It also simplifies unit testing of verifier/repairer, because now it doesn't care about the contents of the "share size" and "block size" fields in the share. It does not relieve the need for share data v2 layout, because we still need to store and retrieve the offsets of the fields which come after the share data; therefore we still need to use share data v2 with its 8-byte fields if we want to store share data larger than about 2^32.
Specify which subset of the block hashes and share hashes you need while downloading a particular share. In the future this will hopefully be used to fetch only a subset, for network efficiency, but currently all of them are fetched, regardless of which subset you specify.
ReadBucketProxy hides the question of whether it has "started" or not (sent a request to the server to get metadata) from its user.
Download is optimized to do as few roundtrips and as few requests as possible, hopefully speeding up download a bit.
2009-01-05 16:51:45 +00:00
|
|
|
for i in range(10)])
|
2007-06-06 17:32:40 +00:00
|
|
|
d = self.send_and_recover((4,8,10), bucket_modes=modemap)
|
2007-04-16 23:30:21 +00:00
|
|
|
def _done(res):
|
|
|
|
self.failUnless(isinstance(res, Failure))
|
2008-10-27 20:34:49 +00:00
|
|
|
self.failUnless(res.check(NotEnoughSharesError))
|
2007-04-16 23:30:21 +00:00
|
|
|
d.addBoth(_done)
|
|
|
|
return d
|
2007-03-30 23:50:50 +00:00
|
|
|
|
2007-04-17 00:15:44 +00:00
|
|
|
def test_missing_sharehashes(self):
    # Buckets 0-5 are assigned the "missing sharehash" mode, so the
    # share hash tree will not validate for them; buckets 6-9 are left
    # "good". The test passes only if the Deferred returned by
    # send_and_recover() fires without an error.
    bucket_modes = dict.fromkeys(range(6), "missing sharehash")
    bucket_modes.update(dict.fromkeys(range(6, 10), "good"))
    return self.send_and_recover((4,8,10), bucket_modes=bucket_modes)
|
2007-04-17 00:15:44 +00:00
|
|
|
|
|
|
|
def test_missing_sharehashes_failure(self):
|
immutable: refactor downloader to be more reusable for checker/verifier/repairer (and better)
The code for validating the share hash tree and the block hash tree has been rewritten to make sure it handles all cases, to share metadata about the file (such as the share hash tree, block hash trees, and UEB) among different share downloads, and not to require hashes to be stored on the server unnecessarily, such as the roots of the block hash trees (not needed since they are also the leaves of the share hash tree), and the root of the share hash tree (not needed since it is also included in the UEB). It also passes the latest tests including handling corrupted shares well.
ValidatedReadBucketProxy takes a share_hash_tree argument to its constructor, which is a reference to a share hash tree shared by all ValidatedReadBucketProxies for that immutable file download.
ValidatedReadBucketProxy requires the block_size and share_size to be provided in its constructor, and it then uses those to compute the offsets and lengths of blocks when it needs them, instead of reading those values out of the share. The user of ValidatedReadBucketProxy therefore has to have first used a ValidatedExtendedURIProxy to compute those two values from the validated contents of the URI. This pleasingly simplifies safety analysis: the client knows which span of bytes corresponds to a given block from the validated URI data, rather than from the unvalidated data stored on the storage server. It also simplifies unit testing of verifier/repairer, because now it doesn't care about the contents of the "share size" and "block size" fields in the share. It does not relieve the need for share data v2 layout, because we still need to store and retrieve the offsets of the fields which come after the share data; therefore we still need to use share data v2 with its 8-byte fields if we want to store share data larger than about 2^32.
Specify which subset of the block hashes and share hashes you need while downloading a particular share. In the future this will hopefully be used to fetch only a subset, for network efficiency, but currently all of them are fetched, regardless of which subset you specify.
ReadBucketProxy hides the question of whether it has "started" or not (sent a request to the server to get metadata) from its user.
Download is optimized to do as few roundtrips and as few requests as possible, hopefully speeding up download a bit.
2009-01-05 16:51:45 +00:00
|
|
|
# all servers are missing their sharehashes, so the sharehash tree will not validate,
|
|
|
|
# and the download will fail
|
2007-04-17 00:15:44 +00:00
|
|
|
modemap = dict([(i, "missing sharehash")
|
immutable: refactor downloader to be more reusable for checker/verifier/repairer (and better)
The code for validating the share hash tree and the block hash tree has been rewritten to make sure it handles all cases, to share metadata about the file (such as the share hash tree, block hash trees, and UEB) among different share downloads, and not to require hashes to be stored on the server unnecessarily, such as the roots of the block hash trees (not needed since they are also the leaves of the share hash tree), and the root of the share hash tree (not needed since it is also included in the UEB). It also passes the latest tests including handling corrupted shares well.
ValidatedReadBucketProxy takes a share_hash_tree argument to its constructor, which is a reference to a share hash tree shared by all ValidatedReadBucketProxies for that immutable file download.
ValidatedReadBucketProxy requires the block_size and share_size to be provided in its constructor, and it then uses those to compute the offsets and lengths of blocks when it needs them, instead of reading those values out of the share. The user of ValidatedReadBucketProxy therefore has to have first used a ValidatedExtendedURIProxy to compute those two values from the validated contents of the URI. This pleasingly simplifies safety analysis: the client knows which span of bytes corresponds to a given block from the validated URI data, rather than from the unvalidated data stored on the storage server. It also simplifies unit testing of verifier/repairer, because now it doesn't care about the contents of the "share size" and "block size" fields in the share. It does not relieve the need for share data v2 layout, because we still need to store and retrieve the offsets of the fields which come after the share data; therefore we still need to use share data v2 with its 8-byte fields if we want to store share data larger than about 2^32.
Specify which subset of the block hashes and share hashes you need while downloading a particular share. In the future this will hopefully be used to fetch only a subset, for network efficiency, but currently all of them are fetched, regardless of which subset you specify.
ReadBucketProxy hides the question of whether it has "started" or not (sent a request to the server to get metadata) from its user.
Download is optimized to do as few roundtrips and as few requests as possible, hopefully speeding up download a bit.
2009-01-05 16:51:45 +00:00
|
|
|
for i in range(10)])
|
2007-06-06 17:32:40 +00:00
|
|
|
d = self.send_and_recover((4,8,10), bucket_modes=modemap)
|
2007-04-17 00:15:44 +00:00
|
|
|
def _done(res):
|
2008-12-21 22:07:52 +00:00
|
|
|
self.failUnless(isinstance(res, Failure), res)
|
|
|
|
self.failUnless(res.check(NotEnoughSharesError), res)
|
2007-04-17 00:15:44 +00:00
|
|
|
d.addBoth(_done)
|
|
|
|
return d
|
|
|
|
|
2007-06-06 17:32:40 +00:00
|
|
|
def test_lost_one_shareholder(self):
    # There are enough shareholders at the start, but one segment in,
    # one of them (bucket 9, in "lost" mode) goes away. The upload
    # should still succeed as long as 'shares_of_happiness' peers
    # remain.
    bucket_modes = dict.fromkeys(range(9), "good")
    bucket_modes[9] = "lost"
    return self.send_and_recover((4,8,10), bucket_modes=bucket_modes)
|
|
|
|
|
2008-01-28 19:14:48 +00:00
|
|
|
def test_lost_one_shareholder_early(self):
    # There are enough shareholders when peers are chosen, but just
    # before the 'start' message is sent one of them (bucket 9, in
    # "lost-early" mode) goes away. The upload should still succeed as
    # long as 'shares_of_happiness' peers remain.
    bucket_modes = dict.fromkeys(range(9), "good")
    bucket_modes[9] = "lost-early"
    return self.send_and_recover((4,8,10), bucket_modes=bucket_modes)
|
|
|
|
|
2007-06-06 17:32:40 +00:00
|
|
|
def test_lost_many_shareholders(self):
    # There are enough shareholders at the start, but one segment in,
    # all but one of them are lost (only bucket 0 stays "good"). The
    # upload should fail with NotEnoughSharesError.
    bucket_modes = dict.fromkeys(range(1, 10), "lost")
    bucket_modes[0] = "good"
    d = self.send_and_recover((4,8,10), bucket_modes=bucket_modes)

    def _expect_not_enough_shares(outcome):
        # addBoth delivers either a result or a Failure; only a Failure
        # of the expected type is acceptable here.
        self.failUnless(isinstance(outcome, Failure))
        self.failUnless(outcome.check(NotEnoughSharesError), outcome)

    d.addBoth(_expect_not_enough_shares)
    return d
|
|
|
|
|
|
|
|
def test_lost_all_shareholders(self):
    # There are enough shareholders at the start, but one segment in,
    # all ten of them are lost. The upload should fail with
    # NotEnoughSharesError.
    modemap = dict([(i, "lost") for i in range(10)])
    d = self.send_and_recover((4,8,10), bucket_modes=modemap)

    def _done(res):
        # addBoth delivers either a result or a Failure. Passing `res`
        # as the assertion message (as the sibling lost/missing-share
        # tests do) makes a failing run show what actually came back
        # instead of a bare "False is not true".
        self.failUnless(isinstance(res, Failure), res)
        self.failUnless(res.check(NotEnoughSharesError), res)

    d.addBoth(_done)
    return d
|