For logging, use a new JSON bytes encoder that works on any bytes string, not just UTF-8-encoded strings.
2025-04-07 10:56:49 +00:00 · 2021-04-14 10:42:01 -04:00 · 2021-04-14 10:42:01 -04:00 · 32607b5ada
commit 32607b5ada
parent 1abf944dd2
7 changed files with 99 additions and 42 deletions
--- a/src/allmydata/test/__init__.py
+++ b/src/allmydata/test/__init__.py
@@ -131,5 +131,5 @@ if sys.platform == "win32":
    initialize()

 from eliot import to_file
-from allmydata.util.jsonbytes import BytesJSONEncoder
-to_file(open("eliot.log", "wb"), encoder=BytesJSONEncoder)
+from allmydata.util.jsonbytes import AnyBytesJSONEncoder
+to_file(open("eliot.log", "wb"), encoder=AnyBytesJSONEncoder)
--- a/src/allmydata/test/eliotutil.py
+++ b/src/allmydata/test/eliotutil.py
@@ -54,7 +54,7 @@ from twisted.python.monkey import (
    MonkeyPatcher,
 )

-from ..util.jsonbytes import BytesJSONEncoder
+from ..util.jsonbytes import AnyBytesJSONEncoder


 _NAME = Field.for_types(
@@ -73,10 +73,7 @@ RUN_TEST = ActionType(

 # On Python 3, we want to use our custom JSON encoder when validating messages
 # can be encoded to JSON:
-if PY2:
-    _memory_logger = MemoryLogger
-else:
-    _memory_logger = lambda: MemoryLogger(encoder=BytesJSONEncoder)
+_memory_logger = lambda: MemoryLogger(encoder=AnyBytesJSONEncoder)


 @attr.s
--- a/src/allmydata/test/test_eliotutil.py
+++ b/src/allmydata/test/test_eliotutil.py
@@ -69,7 +69,7 @@ from ..util.eliotutil import (
    _parse_destination_description,
    _EliotLogging,
 )
-from ..util.jsonbytes import BytesJSONEncoder
+from ..util.jsonbytes import AnyBytesJSONEncoder

 from .common import (
    SyncTestCase,
@@ -109,7 +109,7 @@ class ParseDestinationDescriptionTests(SyncTestCase):
        reactor = object()
        self.assertThat(
            _parse_destination_description("file:-")(reactor),
-            Equals(FileDestination(stdout, encoder=BytesJSONEncoder)),
+            Equals(FileDestination(stdout, encoder=AnyBytesJSONEncoder)),
        )


--- a/src/allmydata/test/test_util.py
+++ b/src/allmydata/test/test_util.py
@@ -495,10 +495,10 @@ class YAML(unittest.TestCase):


 class JSONBytes(unittest.TestCase):
-    """Tests for BytesJSONEncoder."""
+    """Tests for jsonbytes module."""

    def test_encode_bytes(self):
-        """BytesJSONEncoder can encode bytes.
+        """jsonbytes.dumps() encodes bytes.

        Bytes are presumed to be UTF-8 encoded.
        """
@@ -515,7 +515,7 @@ class JSONBytes(unittest.TestCase):
        self.assertEqual(jsonbytes.loads(encoded), expected)

    def test_encode_unicode(self):
-        """BytesJSONEncoder encodes Unicode string as usual."""
+        """jsonbytes.dumps() encodes Unicode string as usual."""
        expected = {
            u"hello": [1, u"cd"],
        }
@@ -529,6 +529,37 @@ class JSONBytes(unittest.TestCase):
        self.assertIsInstance(encoded, bytes)
        self.assertEqual(json.loads(encoded, encoding="utf-8"), x)

+    def test_any_bytes_unsupported_by_default(self):
+        """By default non-UTF-8 bytes raise error."""
+        bytestring = b"abc\xff\x00"
+        with self.assertRaises(UnicodeDecodeError):
+            jsonbytes.dumps(bytestring)
+        with self.assertRaises(UnicodeDecodeError):
+            jsonbytes.dumps_bytes(bytestring)
+        with self.assertRaises(UnicodeDecodeError):
+            json.dumps(bytestring, cls=jsonbytes.UTF8BytesJSONEncoder)
+
+    def test_any_bytes(self):
+        """If any_bytes is True, non-UTF-8 bytes don't break encoding."""
+        bytestring = b"abc\xff"
+        o = {bytestring: bytestring}
+        expected = {"abc\\xff": "abc\\xff"}
+        self.assertEqual(
+            json.loads(jsonbytes.dumps(o, any_bytes=True)),
+            expected,
+        )
+        self.assertEqual(
+            json.loads(json.dumps(
+                o, cls=jsonbytes.AnyBytesJSONEncoder)),
+            expected,
+        )
+        self.assertEqual(
+            json.loads(jsonbytes.dumps(o, any_bytes=True),
+                       encoding="utf-8"),
+            expected,
+        )
+
+

 class FakeGetVersion(object):
    """Emulate an object with a get_version."""
--- a/src/allmydata/util/eliotutil.py
+++ b/src/allmydata/util/eliotutil.py
@@ -87,7 +87,7 @@ from twisted.internet.defer import (
 )
 from twisted.application.service import Service

-from .jsonbytes import BytesJSONEncoder
+from .jsonbytes import AnyBytesJSONEncoder


 def validateInstanceOf(t):
@@ -306,7 +306,7 @@ class _DestinationParser(object):
                    rotateLength=rotate_length,
                    maxRotatedFiles=max_rotated_files,
                )
-        return lambda reactor: FileDestination(get_file(), BytesJSONEncoder)
+        return lambda reactor: FileDestination(get_file(), AnyBytesJSONEncoder)


 _parse_destination_description = _DestinationParser().parse
@@ -333,4 +333,4 @@ def log_call_deferred(action_type):
 if PY2:
    capture_logging = eliot_capture_logging
 else:
-    capture_logging = partial(eliot_capture_logging, encoder_=BytesJSONEncoder)
+    capture_logging = partial(eliot_capture_logging, encoder_=AnyBytesJSONEncoder)
--- a/src/allmydata/util/jsonbytes.py
+++ b/src/allmydata/util/jsonbytes.py
@@ -16,43 +16,75 @@ if PY2:
 import json


-def _bytes_to_unicode(obj):
-    """Convert any bytes objects to unicode, recursively."""
-    if isinstance(obj, bytes):
-        return obj.decode("utf-8")
-    if isinstance(obj, dict):
-        new_obj = {}
-        for k, v in obj.items():
-            if isinstance(k, bytes):
-                k = k.decode("utf-8")
-            v = _bytes_to_unicode(v)
-            new_obj[k] = v
-        return new_obj
-    if isinstance(obj, (list, set, tuple)):
-        return [_bytes_to_unicode(i) for i in obj]
-    return obj
+def _make_bytes_to_unicode(any_bytes):
+    """Create a function that recursively converts bytes to unicode.

-
-class BytesJSONEncoder(json.JSONEncoder):
+    :param any_bytes: If True, also support non-UTF-8-encoded bytes.
    """
-    A JSON encoder than can also encode bytes.
+    errors = "backslashreplace" if any_bytes else "strict"

-    The bytes are assumed to be UTF-8 encoded Unicode strings.
+    def _bytes_to_unicode(obj):
+        """Convert any bytes objects to unicode, recursively."""
+        if isinstance(obj, bytes):
+            return obj.decode("utf-8", errors=errors)
+        if isinstance(obj, dict):
+            new_obj = {}
+            for k, v in obj.items():
+                if isinstance(k, bytes):
+                    k = k.decode("utf-8", errors=errors)
+                v = _bytes_to_unicode(v)
+                new_obj[k] = v
+            return new_obj
+        if isinstance(obj, (list, set, tuple)):
+            return [_bytes_to_unicode(i) for i in obj]
+        return obj
+
+    return _bytes_to_unicode
+
+
+class UTF8BytesJSONEncoder(json.JSONEncoder):
+    """
+    A JSON encoder that can also encode bytes containing UTF-8 encoded text.
    """
    def iterencode(self, o, **kwargs):
-        return json.JSONEncoder.iterencode(self, _bytes_to_unicode(o), **kwargs)
+        return json.JSONEncoder.iterencode(
+            self, _make_bytes_to_unicode(False)(o), **kwargs)
+
+
+class AnyBytesJSONEncoder(json.JSONEncoder):
+    """
+    A JSON encoder that can also encode bytes of any sort.
+
+    Bytes are decoded to strings using UTF-8, if that fails to decode then the
+    bytes are quoted.
+    """
+    def iterencode(self, o, **kwargs):
+        return json.JSONEncoder.iterencode(
+            self, _make_bytes_to_unicode(True)(o), **kwargs)


 def dumps(obj, *args, **kwargs):
    """Encode to JSON, supporting bytes as keys or values.

-    The bytes are assumed to be UTF-8 encoded Unicode strings.
+    :param bool any_bytes: If False (the default) the bytes are assumed to be
+        UTF-8 encoded Unicode strings.  If True, non-UTF-8 bytes are quoted for
+        human consumption.
    """
-    return json.dumps(obj, cls=BytesJSONEncoder, *args, **kwargs)
+    any_bytes = kwargs.pop("any_bytes", False)
+    if any_bytes:
+        cls = AnyBytesJSONEncoder
+    else:
+        cls = UTF8BytesJSONEncoder
+    return json.dumps(obj, cls=cls, *args, **kwargs)


 def dumps_bytes(obj, *args, **kwargs):
-    """Encode to JSON, then encode as bytes."""
+    """Encode to JSON, then encode as bytes.
+
+    :param bool any_bytes: If False (the default) the bytes are assumed to be
+        UTF-8 encoded Unicode strings.  If True, non-UTF-8 bytes are quoted for
+        human consumption.
+    """
    result = dumps(obj, *args, **kwargs)
    if PY3:
        result = result.encode("utf-8")
--- a/src/allmydata/web/logs.py
+++ b/src/allmydata/web/logs.py
@@ -47,10 +47,7 @@ class TokenAuthenticatedWebSocketServerProtocol(WebSocketServerProtocol):
        """
        # probably want a try/except around here? what do we do if
        # transmission fails or anything else bad happens?
-        encoded = json.dumps(message)
-        if isinstance(encoded, str):
-            # On Python 3 dumps() returns Unicode...
-            encoded = encoded.encode("utf-8")
+        encoded = json.dumps_bytes(message, any_bytes=True)
        self.sendMessage(encoded)

    def onOpen(self):