Ensure and test (with the necessary refactoring) that a missing content-type is

treated the same as the CBOR content-type, as per spec.
2024-12-19 13:07:56 +00:00 · 2023-07-25 15:31:30 -04:00 · 2023-07-25 15:31:30 -04:00 · 46d10a6281
commit 46d10a6281
parent bf2451bbcd
2 changed files with 108 additions and 57 deletions
--- a/src/allmydata/storage/http_server.py
+++ b/src/allmydata/storage/http_server.py
@ -530,6 +530,60 @@ def _add_error_handling(app: Klein):
        return str(failure.value).encode("utf-8")
 async def read_encoded(
    reactor, request, schema: Schema, max_size: int = 1024 * 1024
 ) -> Any:
    """
    Read encoded request body data, decoding it with CBOR by default.

    A request with no ``Content-Type`` header is treated exactly like one
    that declares the CBOR MIME type.

    Somewhat arbitrarily, limit body size to 1MiB by default.

    :param reactor: Passed to ``defer_to_thread`` when validation of a
        larger body is pushed to another thread.
    :param request: The HTTP request; its ``requestHeaders`` and seekable
        ``content`` file-like object are read.
    :param schema: Schema whose ``validate_cbor`` is run on the raw body
        before it is decoded.
    :param max_size: Maximum accepted body size, in bytes.

    :return: The Python object produced by ``cbor2.load`` on the body.

    :raises _HTTPError: With ``UNSUPPORTED_MEDIA_TYPE`` if a content type
        other than CBOR is declared, or ``REQUEST_ENTITY_TOO_LARGE`` if the
        body is bigger than ``max_size``.  Whatever ``schema.validate_cbor``
        raises for a non-conforming body is propagated unchanged.
    """
    content_type = get_content_type(request.requestHeaders)
    if content_type is None:
        # No Content-Type header at all: default to CBOR.
        content_type = CBOR_MIME_TYPE
    if content_type != CBOR_MIME_TYPE:
        raise _HTTPError(http.UNSUPPORTED_MEDIA_TYPE)

    # Make sure it's not too large:
    request.content.seek(0, SEEK_END)
    size = request.content.tell()
    if size > max_size:
        raise _HTTPError(http.REQUEST_ENTITY_TOO_LARGE)
    request.content.seek(0, SEEK_SET)

    # We don't want to load the whole message into memory, because it might
    # be quite large. The CDDL validator takes a read-only bytes-like
    # thing. Luckily, for large request bodies twisted.web will buffer the
    # data in a file, so we can use mmap() to get a memory view. The CDDL
    # validator will not make a copy, so it won't increase memory usage
    # beyond that.
    try:
        fd = request.content.fileno()
    except (ValueError, OSError):
        # Not backed by a real file descriptor (e.g. an in-memory buffer).
        fd = -1
    if fd >= 0:
        # It's a file, so we can use mmap() to save memory.
        message = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
    else:
        message = request.content.read()

    # Pycddl will release the GIL when validating larger documents, so
    # let's take advantage of multiple CPUs:
    if size > 10_000:
        await defer_to_thread(reactor, schema.validate_cbor, message)
    else:
        schema.validate_cbor(message)

    # The CBOR parser will allocate more memory, but at least we can feed
    # it the file-like object, so that if it's large it won't make two
    # copies.
    #
    # Rewind to the start of the body. The signature is seek(offset, whence);
    # the arguments used to be passed the other way round, which only worked
    # because SEEK_SET happens to equal 0.
    request.content.seek(0, SEEK_SET)

    # Typically deserialization to Python will not release the GIL, and
    # indeed as of Jan 2023 cbor2 didn't have any code to release the GIL
    # in the decode path. As such, running it in a different thread has no benefit.
    return cbor2.load(request.content)
 class HTTPServer(object):
    """
    A HTTP interface to the storage server.
@ -587,56 +641,6 @@ class HTTPServer(object):
            # https://tahoe-lafs.org/trac/tahoe-lafs/ticket/3861
            raise _HTTPError(http.NOT_ACCEPTABLE)
    async def _read_encoded(
        self, request, schema: Schema, max_size: int = 1024 * 1024
    ) -> Any:
        """
        Read encoded request body data, decoding it with CBOR by default.
        Somewhat arbitrarily, limit body size to 1MiB by default.
        """
        content_type = get_content_type(request.requestHeaders)
        if content_type != CBOR_MIME_TYPE:
            raise _HTTPError(http.UNSUPPORTED_MEDIA_TYPE)
        # Make sure it's not too large:
        request.content.seek(0, SEEK_END)
        size = request.content.tell()
        if size > max_size:
            raise _HTTPError(http.REQUEST_ENTITY_TOO_LARGE)
        request.content.seek(0, SEEK_SET)
        # We don't want to load the whole message into memory, cause it might
        # be quite large. The CDDL validator takes a read-only bytes-like
        # thing. Luckily, for large request bodies twisted.web will buffer the
        # data in a file, so we can use mmap() to get a memory view. The CDDL
        # validator will not make a copy, so it won't increase memory usage
        # beyond that.
        try:
            fd = request.content.fileno()
        except (ValueError, OSError):
            fd = -1
        if fd >= 0:
            # It's a file, so we can use mmap() to save memory.
            message = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
        else:
            message = request.content.read()
        # Pycddl will release the GIL when validating larger documents, so
        # let's take advantage of multiple CPUs:
        if size > 10_000:
            await defer_to_thread(self._reactor, schema.validate_cbor, message)
        else:
            schema.validate_cbor(message)
        # The CBOR parser will allocate more memory, but at least we can feed
        # it the file-like object, so that if it's large it won't be make two
        # copies.
        request.content.seek(SEEK_SET, 0)
        # Typically deserialization to Python will not release the GIL, and
        # indeed as of Jan 2023 cbor2 didn't have any code to release the GIL
        # in the decode path. As such, running it in a different thread has no benefit.
        return cbor2.load(request.content)
    ##### Generic APIs #####
@ -677,8 +681,8 @@ class HTTPServer(object):
        """Allocate buckets."""
        upload_secret = authorization[Secrets.UPLOAD]
        # It's just a list of up to ~256 shares, shouldn't use many bytes.
-        info = await self._read_encoded(
+        info = await read_encoded(
-            request, _SCHEMAS["allocate_buckets"], max_size=8192
+            self._reactor, request, _SCHEMAS["allocate_buckets"], max_size=8192
        )
        # We do NOT validate the upload secret for existing bucket uploads.
@ -849,7 +853,8 @@ class HTTPServer(object):
        # The reason can be a string with explanation, so in theory it could be
        # longish?
-        info = await self._read_encoded(
+        info = await read_encoded(
            self._reactor,
            request,
            _SCHEMAS["advise_corrupt_share"],
            max_size=32768,
@ -868,8 +873,8 @@ class HTTPServer(object):
    @async_to_deferred
    async def mutable_read_test_write(self, request, authorization, storage_index):
        """Read/test/write combined operation for mutables."""
-        rtw_request = await self._read_encoded(
+        rtw_request = await read_encoded(
-            request, _SCHEMAS["mutable_read_test_write"], max_size=2**48
+            self._reactor, request, _SCHEMAS["mutable_read_test_write"], max_size=2**48
        )
        secrets = (
            authorization[Secrets.WRITE_ENABLER],
@ -955,8 +960,8 @@ class HTTPServer(object):
        # The reason can be a string with explanation, so in theory it could be
        # longish?
-        info = await self._read_encoded(
+        info = await read_encoded(
-            request, _SCHEMAS["advise_corrupt_share"], max_size=32768
+            self._reactor, request, _SCHEMAS["advise_corrupt_share"], max_size=32768
        )
        self._storage_server.advise_corrupt_share(
            b"mutable", storage_index, share_number, info["reason"].encode("utf-8")
--- a/src/allmydata/test/test_storage_http.py
+++ b/src/allmydata/test/test_storage_http.py
@ -42,6 +42,7 @@ from werkzeug.exceptions import NotFound as WNotFound
 from testtools.matchers import Equals
 from zope.interface import implementer
 from ..util.deferredutil import async_to_deferred
 from .common import SyncTestCase
 from ..storage.http_common import (
    get_content_type,
@ -59,6 +60,8 @@ from ..storage.http_server import (
    _authorized_route,
    StorageIndexConverter,
    _add_error_handling,
    read_encoded,
    _SCHEMAS as SERVER_SCHEMAS,
 )
 from ..storage.http_client import (
    StorageClient,
@ -303,6 +306,14 @@ class TestApp(object):
        request.transport.loseConnection()
        return Deferred()
    @_authorized_route(_app, set(), "/read_body", methods=["POST"])
    @async_to_deferred
    async def read_body(self, request, authorization):
        """
        Decode the request body with ``read_encoded``, validating it against
        the server's ``advise_corrupt_share`` schema, and respond with the
        value of its ``"reason"`` entry.
        """
        schema = SERVER_SCHEMAS["advise_corrupt_share"]
        decoded = await read_encoded(self.clock, request, schema)
        return decoded["reason"]
 def result_of(d):
    """
@ -320,6 +331,7 @@ def result_of(d):
        + "This is probably a test design issue."
    )
 class CustomHTTPServerTests(SyncTestCase):
    """
    Tests that use a custom HTTP server.
@ -504,6 +516,40 @@ class CustomHTTPServerTests(SyncTestCase):
            result_of(d)
        self.assertEqual(len(self._http_server.clock.getDelayedCalls()), 0)
    def test_request_with_no_content_type_same_as_cbor(self):
        """
        A body sent without any ``Content-Type`` header is handled by the
        server as though it had been declared to be CBOR.
        """
        url = DecodedURL.from_text("http://127.0.0.1/read_body")
        # Deliberately pass no headers with the CBOR-encoded body.
        response = result_of(
            self.client.request("POST", url, data=dumps({"reason": "test"}))
        )
        body = result_of(limited_content(response, self._http_server.clock, 100))
        # The body was accepted and decoded: the "reason" value comes back.
        self.assertEqual(body.read(), b"test")
    def test_request_with_wrong_content(self):
        """
        If a non-CBOR ``Content-Type`` header is set when sending a body, the
        server complains appropriately.
        """
        url = DecodedURL.from_text("http://127.0.0.1/read_body")
        # The body itself is valid CBOR; only the declared type is wrong.
        payload = dumps({"reason": "test"})
        bad_headers = Headers()
        bad_headers.setRawHeaders("content-type", ["some/value"])
        response = result_of(
            self.client.request("POST", url, data=payload, headers=bad_headers)
        )
        self.assertEqual(response.code, http.UNSUPPORTED_MEDIA_TYPE)
@implementer(IReactorFromThreads)
 class Reactor(Clock):