Switch to using pycddl for CBOR decoding.

2025-03-10 22:43:52 +00:00 · 2024-01-24 13:50:55 -05:00 · 2024-01-24 13:50:55 -05:00 · fced1ab01b
commit fced1ab01b
parent 68d63fde27
5 changed files with 15 additions and 39 deletions
--- a/setup.py
+++ b/setup.py
@ -146,9 +146,8 @@ install_requires = [
    # 5.6.0 excluded because https://github.com/agronholm/cbor2/issues/208
    "cbor2 != 5.6.0",

-    # 0.4 adds the ability to pass in mmap() values which greatly reduces the
-    # amount of copying involved.
-    "pycddl >= 0.4",
+    # 0.6 adds the ability to decode CBOR.
+    "pycddl >= 0.6",

    # Command-line parsing
    "click >= 8.1.1",
--- a/src/allmydata/storage/http_client.py
+++ b/src/allmydata/storage/http_client.py
@ -63,7 +63,7 @@ from ..util.hashutil import timing_safe_compare
 from ..util.deferredutil import async_to_deferred
 from ..util.tor_provider import _Provider as TorProvider
 from ..util.cputhreadpool import defer_to_thread
-from ..util.cbor import dumps, loads
+from ..util.cbor import dumps

 try:
    from txtorcon import Tor  # type: ignore
@ -560,8 +560,7 @@ class StorageClient(object):
                    data = f.read()

                    def validate_and_decode():
-                        schema.validate_cbor(data)
-                        return loads(data)
+                        return schema.validate_cbor(data, True)

                    return await defer_to_thread(validate_and_decode)
                else:
--- a/src/allmydata/storage/http_server.py
+++ b/src/allmydata/storage/http_server.py
@ -637,17 +637,8 @@ async def read_encoded(

    # Pycddl will release the GIL when validating larger documents, so
    # let's take advantage of multiple CPUs:
-    await defer_to_thread(schema.validate_cbor, message)
-
-    # The CBOR parser will allocate more memory, but at least we can feed
-    # it the file-like object, so that if it's large it won't be make two
-    # copies.
-    request.content.seek(SEEK_SET, 0)
-    # Typically deserialization to Python will not release the GIL, and
-    # indeed as of Jan 2023 cbor2 didn't have any code to release the GIL
-    # in the decode path. As such, running it in a different thread has no benefit.
-    return cbor.load(request.content)
-
+    decoded = await defer_to_thread(schema.validate_cbor, message, True)
+    return decoded

 class HTTPServer(BaseApp):
    """
--- a/src/allmydata/test/test_storage_http.py
+++ b/src/allmydata/test/test_storage_http.py
@ -41,7 +41,7 @@ from werkzeug.exceptions import NotFound as WNotFound
 from testtools.matchers import Equals
 from zope.interface import implementer

-from ..util.cbor import dumps, loads
+from ..util.cbor import dumps
 from ..util.deferredutil import async_to_deferred
 from ..util.cputhreadpool import disable_thread_pool_for_test
 from .common import SyncTestCase
@ -1835,14 +1835,3 @@ class MutableSharedTests(SharedImmutableMutableTestsMixin, SyncTestCase):
        A read with no range returns the whole mutable.
        """
        return self._read_with_no_range_test(data_length)
-
-    def test_roundtrip_cbor2_encoding_issue(self):
-        """
-        Some versions of cbor2 (5.6.0) don't correctly encode bytestrings
-        bigger than 65535
-        """
-        for size in range(0, 65535*2, 17):
-            self.assertEqual(
-                size,
-                len(loads(dumps(b"\12" * size)))
-            )
--- a/src/allmydata/util/cbor.py
+++ b/src/allmydata/util/cbor.py
@ -1,21 +1,19 @@
 """
 Unified entry point for CBOR encoding and decoding.
-"""

-import sys
+Makes it less likely to use ``cbor2.loads()`` by mistake, which we want to avoid.
+"""

 # We don't want to use the C extension for loading, at least for now, but using
 # it for dumping should be fine.
 from cbor2 import dumps, dump

-# Now, override the C extension so we can import the Python versions of loading
-# functions.
-del sys.modules["cbor2"]
-sys.modules["_cbor2"] = None  # type: ignore[assignment]
-from cbor2 import load, loads
+def load(*args, **kwargs):
+    """
+    Don't use this!  Here just in case someone uses it by mistake.
+    """
+    raise RuntimeError("Use pycddl for decoding CBOR")

-# Quick validation that we got the Python version, not the C version.
-assert type(load) == type(lambda: None), repr(load)   # type: ignore[comparison-overlap]
-assert type(loads) == type(lambda: None), repr(loads)   # type: ignore[comparison-overlap]
+loads = load

 __all__ = ["dumps", "loads", "dump", "load"]