From fced1ab01bb55bc58f521e544d8a2e450a3a9ceb Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 24 Jan 2024 13:50:55 -0500 Subject: [PATCH] Switch to using pycddl for CBOR decoding. --- setup.py | 5 ++--- src/allmydata/storage/http_client.py | 5 ++--- src/allmydata/storage/http_server.py | 13 ++----------- src/allmydata/test/test_storage_http.py | 13 +------------ src/allmydata/util/cbor.py | 18 ++++++++---------- 5 files changed, 15 insertions(+), 39 deletions(-) diff --git a/setup.py b/setup.py index 34d436a49..6011b45fb 100644 --- a/setup.py +++ b/setup.py @@ -146,9 +146,8 @@ install_requires = [ # 5.6.0 excluded because https://github.com/agronholm/cbor2/issues/208 "cbor2 != 5.6.0", - # 0.4 adds the ability to pass in mmap() values which greatly reduces the - # amount of copying involved. - "pycddl >= 0.4", + # 0.6 adds the ability to decode CBOR. + "pycddl >= 0.6", # Command-line parsing "click >= 8.1.1", diff --git a/src/allmydata/storage/http_client.py b/src/allmydata/storage/http_client.py index 10f943fe1..f0570a4d6 100644 --- a/src/allmydata/storage/http_client.py +++ b/src/allmydata/storage/http_client.py @@ -63,7 +63,7 @@ from ..util.hashutil import timing_safe_compare from ..util.deferredutil import async_to_deferred from ..util.tor_provider import _Provider as TorProvider from ..util.cputhreadpool import defer_to_thread -from ..util.cbor import dumps, loads +from ..util.cbor import dumps try: from txtorcon import Tor # type: ignore @@ -560,8 +560,7 @@ class StorageClient(object): data = f.read() def validate_and_decode(): - schema.validate_cbor(data) - return loads(data) + return schema.validate_cbor(data, True) return await defer_to_thread(validate_and_decode) else: diff --git a/src/allmydata/storage/http_server.py b/src/allmydata/storage/http_server.py index 324584f01..2e1a6a413 100644 --- a/src/allmydata/storage/http_server.py +++ b/src/allmydata/storage/http_server.py @@ -637,17 +637,8 @@ async def read_encoded( # Pycddl will release the GIL when validating larger documents, so # let's take advantage of multiple CPUs: - await defer_to_thread(schema.validate_cbor, message) - - # The CBOR parser will allocate more memory, but at least we can feed - # it the file-like object, so that if it's large it won't be make two - # copies. - request.content.seek(SEEK_SET, 0) - # Typically deserialization to Python will not release the GIL, and - # indeed as of Jan 2023 cbor2 didn't have any code to release the GIL - # in the decode path. As such, running it in a different thread has no benefit. - return cbor.load(request.content) - + decoded = await defer_to_thread(schema.validate_cbor, message, True) + return decoded class HTTPServer(BaseApp): """ diff --git a/src/allmydata/test/test_storage_http.py b/src/allmydata/test/test_storage_http.py index 066ec8e84..185cfa995 100644 --- a/src/allmydata/test/test_storage_http.py +++ b/src/allmydata/test/test_storage_http.py @@ -41,7 +41,7 @@ from werkzeug.exceptions import NotFound as WNotFound from testtools.matchers import Equals from zope.interface import implementer -from ..util.cbor import dumps, loads +from ..util.cbor import dumps from ..util.deferredutil import async_to_deferred from ..util.cputhreadpool import disable_thread_pool_for_test from .common import SyncTestCase @@ -1835,14 +1835,3 @@ class MutableSharedTests(SharedImmutableMutableTestsMixin, SyncTestCase): A read with no range returns the whole mutable. """ return self._read_with_no_range_test(data_length) - - def test_roundtrip_cbor2_encoding_issue(self): - """ - Some versions of cbor2 (5.6.0) don't correctly encode bytestrings - bigger than 65535 - """ - for size in range(0, 65535*2, 17): - self.assertEqual( - size, - len(loads(dumps(b"\12" * size))) - ) diff --git a/src/allmydata/util/cbor.py b/src/allmydata/util/cbor.py index aa67bd7e3..a4b33ecec 100644 --- a/src/allmydata/util/cbor.py +++ b/src/allmydata/util/cbor.py @@ -1,21 +1,19 @@ """ Unified entry point for CBOR encoding and decoding. -""" -import sys +Makes it less likely to use ``cbor2.loads()`` by mistake, which we want to avoid. +""" # We don't want to use the C extension for loading, at least for now, but using # it for dumping should be fine. from cbor2 import dumps, dump -# Now, override the C extension so we can import the Python versions of loading -# functions. -del sys.modules["cbor2"] -sys.modules["_cbor2"] = None # type: ignore[assignment] -from cbor2 import load, loads +def load(*args, **kwargs): + """ + Don't use this! Here just in case someone uses it by mistake. + """ + raise RuntimeError("Use pycddl for decoding CBOR") -# Quick validation that we got the Python version, not the C version. -assert type(load) == type(lambda: None), repr(load) # type: ignore[comparison-overlap] -assert type(loads) == type(lambda: None), repr(loads) # type: ignore[comparison-overlap] +loads = load __all__ = ["dumps", "loads", "dump", "load"]