Merge pull request #1190 from tahoe-lafs/3802-cddl

CDDL-based schema validation for the HTTP storage server

Fixes ticket:3802
This commit is contained in:
Itamar Turner-Trauring 2022-04-13 14:07:32 -04:00 committed by GitHub
commit 0c18dca1ad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 152 additions and 22 deletions

View File

@ -350,6 +350,9 @@ Because of the simple types used throughout
and the equivalence described in `RFC 7049`_
these examples should be representative regardless of which of these two encodings is chosen.
For CBOR messages, any sequence that is semantically a set (i.e. no repeated values allowed, order doesn't matter, and elements are hashable in Python) should be sent as a set.
Tag 6.258 is used to indicate sets in CBOR; see `the CBOR registry <https://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml>`_ for more details.
HTTP Design
~~~~~~~~~~~

0
newsfragments/3802.minor Normal file
View File

View File

@ -135,7 +135,8 @@ install_requires = [
"klein",
"werkzeug",
"treq",
"cbor2"
"cbor2",
"pycddl",
]
setup_requires = [

View File

@ -11,6 +11,7 @@ import attr
# TODO Make sure to import Python version?
from cbor2 import loads, dumps
from pycddl import Schema
from collections_extended import RangeMap
from werkzeug.datastructures import Range, ContentRange
from twisted.web.http_headers import Headers
@ -36,18 +37,69 @@ class ClientException(Exception):
self.code = code
def _decode_cbor(response):
# Schemas for server responses.
#
# Tags are of the form #6.nnn, where the number is documented at
# https://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml. Notably, #6.258
# indicates a set.
_SCHEMAS = {
"get_version": Schema(
"""
message = {'http://allmydata.org/tahoe/protocols/storage/v1' => {
'maximum-immutable-share-size' => uint
'maximum-mutable-share-size' => uint
'available-space' => uint
'tolerates-immutable-read-overrun' => bool
'delete-mutable-shares-with-zero-length-writev' => bool
'fills-holes-with-zero-bytes' => bool
'prevents-read-past-end-of-share-data' => bool
}
'application-version' => bstr
}
"""
),
"allocate_buckets": Schema(
"""
message = {
already-have: #6.258([* uint])
allocated: #6.258([* uint])
}
"""
),
"immutable_write_share_chunk": Schema(
"""
message = {
required: [* {begin: uint, end: uint}]
}
"""
),
"list_shares": Schema(
"""
message = #6.258([* uint])
"""
),
}
def _decode_cbor(response, schema: Schema):
"""Given HTTP response, return decoded CBOR body."""
def got_content(data):
schema.validate_cbor(data)
return loads(data)
if response.code > 199 and response.code < 300:
content_type = get_content_type(response.headers)
if content_type == CBOR_MIME_TYPE:
# TODO limit memory usage
# https://tahoe-lafs.org/trac/tahoe-lafs/ticket/3872
return treq.content(response).addCallback(loads)
return treq.content(response).addCallback(got_content)
else:
raise ClientException(-1, "Server didn't send CBOR")
else:
return fail(ClientException(response.code, response.phrase))
return treq.content(response).addCallback(
lambda data: fail(ClientException(response.code, response.phrase, data))
)
@attr.s
@ -149,7 +201,7 @@ class StorageClientGeneral(object):
"""
url = self._client.relative_url("/v1/version")
response = yield self._client.request("GET", url)
decoded_response = yield _decode_cbor(response)
decoded_response = yield _decode_cbor(response, _SCHEMAS["get_version"])
returnValue(decoded_response)
@ -207,7 +259,7 @@ class StorageClientImmutables(object):
upload_secret=upload_secret,
message_to_serialize=message,
)
decoded_response = yield _decode_cbor(response)
decoded_response = yield _decode_cbor(response, _SCHEMAS["allocate_buckets"])
returnValue(
ImmutableCreateResult(
already_have=decoded_response["already-have"],
@ -279,7 +331,7 @@ class StorageClientImmutables(object):
raise ClientException(
response.code,
)
body = yield _decode_cbor(response)
body = yield _decode_cbor(response, _SCHEMAS["immutable_write_share_chunk"])
remaining = RangeMap()
for chunk in body["required"]:
remaining.set(True, chunk["begin"], chunk["end"])
@ -332,7 +384,7 @@ class StorageClientImmutables(object):
url,
)
if response.code == http.OK:
body = yield _decode_cbor(response)
body = yield _decode_cbor(response, _SCHEMAS["list_shares"])
returnValue(set(body))
else:
raise ClientException(response.code)

View File

@ -21,7 +21,7 @@ from werkzeug.datastructures import ContentRange
# TODO Make sure to use pure Python versions?
from cbor2 import dumps, loads
from pycddl import Schema, ValidationError as CDDLValidationError
from .server import StorageServer
from .http_common import swissnum_auth_header, Secrets, get_content_type, CBOR_MIME_TYPE
from .common import si_a2b
@ -86,8 +86,8 @@ def _authorization_decorator(required_secrets):
try:
secrets = _extract_secrets(authorization, required_secrets)
except ClientSecretsException:
request.setResponseCode(400)
return b""
request.setResponseCode(http.BAD_REQUEST)
return b"Missing required secrets"
return f(self, request, secrets, *args, **kwargs)
return route
@ -215,6 +215,25 @@ class _HTTPError(Exception):
self.code = code
# CDDL schemas.
#
# Tags are of the form #6.nnn, where the number is documented at
# https://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml. Notably, #6.258
# indicates a set.
_SCHEMAS = {
"allocate_buckets": Schema("""
message = {
share-numbers: #6.258([* uint])
allocated-size: uint
}
"""),
"advise_corrupt_share": Schema("""
message = {
reason: tstr
}
""")
}
class HTTPServer(object):
"""
A HTTP interface to the storage server.
@ -229,6 +248,12 @@ class HTTPServer(object):
request.setResponseCode(failure.value.code)
return b""
@_app.handle_errors(CDDLValidationError)
def _cddl_validation_error(self, request, failure):
"""Handle CDDL validation errors."""
request.setResponseCode(http.BAD_REQUEST)
return str(failure.value).encode("utf-8")
def __init__(
self, storage_server, swissnum
): # type: (StorageServer, bytes) -> None
@ -268,7 +293,7 @@ class HTTPServer(object):
# https://tahoe-lafs.org/trac/tahoe-lafs/ticket/3861
raise _HTTPError(http.NOT_ACCEPTABLE)
def _read_encoded(self, request) -> Any:
def _read_encoded(self, request, schema: Schema) -> Any:
"""
Read encoded request body data, decoding it with CBOR by default.
"""
@ -276,7 +301,10 @@ class HTTPServer(object):
if content_type == CBOR_MIME_TYPE:
# TODO limit memory usage, client could send arbitrarily large data...
# https://tahoe-lafs.org/trac/tahoe-lafs/ticket/3872
return loads(request.content.read())
message = request.content.read()
schema.validate_cbor(message)
result = loads(message)
return result
else:
raise _HTTPError(http.UNSUPPORTED_MEDIA_TYPE)
@ -298,7 +326,7 @@ class HTTPServer(object):
def allocate_buckets(self, request, authorization, storage_index):
"""Allocate buckets."""
upload_secret = authorization[Secrets.UPLOAD]
info = self._read_encoded(request)
info = self._read_encoded(request, _SCHEMAS["allocate_buckets"])
# We do NOT validate the upload secret for existing bucket uploads.
# Another upload may be happening in parallel, with a different upload
@ -408,7 +436,7 @@ class HTTPServer(object):
"""
List shares for the given storage index.
"""
share_numbers = list(self._storage_server.get_buckets(storage_index).keys())
share_numbers = set(self._storage_server.get_buckets(storage_index).keys())
return self._send_encoded(request, share_numbers)
@_authorized_route(
@ -498,6 +526,6 @@ class HTTPServer(object):
except KeyError:
raise _HTTPError(http.NOT_FOUND)
info = self._read_encoded(request)
info = self._read_encoded(request, _SCHEMAS["advise_corrupt_share"])
bucket.advise_corrupt_share(info["reason"].encode("utf-8"))
return b""

View File

@ -18,6 +18,8 @@ from base64 import b64encode
from contextlib import contextmanager
from os import urandom
from cbor2 import dumps
from pycddl import ValidationError as CDDLValidationError
from hypothesis import assume, given, strategies as st
from fixtures import Fixture, TempDir
from treq.testing import StubTreq
@ -49,7 +51,7 @@ from ..storage.http_client import (
StorageClientGeneral,
_encode_si,
)
from ..storage.http_common import get_content_type
from ..storage.http_common import get_content_type, CBOR_MIME_TYPE
from ..storage.common import si_b2a
@ -239,6 +241,12 @@ class TestApp(object):
else:
return "BAD: {}".format(authorization)
@_authorized_route(_app, set(), "/v1/version", methods=["GET"])
def bad_version(self, request, authorization):
"""Return version result that violates the expected schema."""
request.setHeader("content-type", CBOR_MIME_TYPE)
return dumps({"garbage": 123})
def result_of(d):
"""
@ -257,15 +265,15 @@ def result_of(d):
)
class RoutingTests(SyncTestCase):
class CustomHTTPServerTests(SyncTestCase):
"""
Tests for the HTTP routing infrastructure.
Tests that use a custom HTTP server.
"""
def setUp(self):
if PY2:
self.skipTest("Not going to bother supporting Python 2")
super(RoutingTests, self).setUp()
super(CustomHTTPServerTests, self).setUp()
# Could be a fixture, but will only be used in this test class so not
# going to bother:
self._http_server = TestApp()
@ -277,8 +285,8 @@ class RoutingTests(SyncTestCase):
def test_authorization_enforcement(self):
"""
The requirement for secrets is enforced; if they are not given, a 400
response code is returned.
The requirement for secrets is enforced by the ``_authorized_route``
decorator; if they are not given, a 400 response code is returned.
"""
# Without secret, get a 400 error.
response = result_of(
@ -298,6 +306,14 @@ class RoutingTests(SyncTestCase):
self.assertEqual(response.code, 200)
self.assertEqual(result_of(response.content()), b"GOOD SECRET")
def test_client_side_schema_validation(self):
"""
The client validates returned CBOR message against a schema.
"""
client = StorageClientGeneral(self.client)
with self.assertRaises(CDDLValidationError):
result_of(client.get_version())
class HttpTestFixture(Fixture):
"""
@ -413,6 +429,36 @@ class GenericHTTPAPITests(SyncTestCase):
)
self.assertEqual(version, expected_version)
def test_server_side_schema_validation(self):
"""
Ensure that schema validation is happening: invalid CBOR should result
in bad request response code (error 400).
We don't bother checking every single request, the API on the
server-side is designed to require a schema, so it validates
everywhere. But we check at least one to ensure we get correct
response code on bad input, so we know validation happened.
"""
upload_secret = urandom(32)
lease_secret = urandom(32)
storage_index = urandom(16)
url = self.http.client.relative_url(
"/v1/immutable/" + _encode_si(storage_index)
)
message = {"bad-message": "missing expected keys"}
response = result_of(
self.http.client.request(
"POST",
url,
lease_renew_secret=lease_secret,
lease_cancel_secret=lease_secret,
upload_secret=upload_secret,
message_to_serialize=message,
)
)
self.assertEqual(response.code, http.BAD_REQUEST)
class ImmutableHTTPAPITests(SyncTestCase):
"""