From 1abf944dd21b8c5db052215b4497696ce2eeb592 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 14 Apr 2021 10:38:57 -0400 Subject: [PATCH 1/9] News file. --- newsfragments/3672.minor | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 newsfragments/3672.minor diff --git a/newsfragments/3672.minor b/newsfragments/3672.minor new file mode 100644 index 000000000..e69de29bb From 32607b5ada8065f181c0e83937eaf247d9409256 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 14 Apr 2021 10:42:01 -0400 Subject: [PATCH 2/9] For logging, using a new JSON bytes encoder that works on any bytes string, not just UTF-8-encoded strings. --- src/allmydata/test/__init__.py | 4 +- src/allmydata/test/eliotutil.py | 7 +-- src/allmydata/test/test_eliotutil.py | 4 +- src/allmydata/test/test_util.py | 37 +++++++++++-- src/allmydata/util/eliotutil.py | 6 +-- src/allmydata/util/jsonbytes.py | 78 ++++++++++++++++++++-------- src/allmydata/web/logs.py | 5 +- 7 files changed, 99 insertions(+), 42 deletions(-) diff --git a/src/allmydata/test/__init__.py b/src/allmydata/test/__init__.py index c75f8d003..26b17e997 100644 --- a/src/allmydata/test/__init__.py +++ b/src/allmydata/test/__init__.py @@ -131,5 +131,5 @@ if sys.platform == "win32": initialize() from eliot import to_file -from allmydata.util.jsonbytes import BytesJSONEncoder -to_file(open("eliot.log", "wb"), encoder=BytesJSONEncoder) +from allmydata.util.jsonbytes import AnyBytesJSONEncoder +to_file(open("eliot.log", "wb"), encoder=AnyBytesJSONEncoder) diff --git a/src/allmydata/test/eliotutil.py b/src/allmydata/test/eliotutil.py index c2359f132..35dfb09eb 100644 --- a/src/allmydata/test/eliotutil.py +++ b/src/allmydata/test/eliotutil.py @@ -54,7 +54,7 @@ from twisted.python.monkey import ( MonkeyPatcher, ) -from ..util.jsonbytes import BytesJSONEncoder +from ..util.jsonbytes import AnyBytesJSONEncoder _NAME = Field.for_types( @@ -73,10 +73,7 @@ RUN_TEST = ActionType( # On Python 3, we want to use our custom JSON encoder when validating messages # can be encoded to JSON: -if PY2: - _memory_logger = MemoryLogger -else: - _memory_logger = lambda: MemoryLogger(encoder=BytesJSONEncoder) +_memory_logger = lambda: MemoryLogger(encoder=AnyBytesJSONEncoder) @attr.s diff --git a/src/allmydata/test/test_eliotutil.py b/src/allmydata/test/test_eliotutil.py index aca677323..3f915ecd2 100644 --- a/src/allmydata/test/test_eliotutil.py +++ b/src/allmydata/test/test_eliotutil.py @@ -69,7 +69,7 @@ from ..util.eliotutil import ( _parse_destination_description, _EliotLogging, ) -from ..util.jsonbytes import BytesJSONEncoder +from ..util.jsonbytes import AnyBytesJSONEncoder from .common import ( SyncTestCase, @@ -109,7 +109,7 @@ class ParseDestinationDescriptionTests(SyncTestCase): reactor = object() self.assertThat( _parse_destination_description("file:-")(reactor), - Equals(FileDestination(stdout, encoder=BytesJSONEncoder)), + Equals(FileDestination(stdout, encoder=AnyBytesJSONEncoder)), ) diff --git a/src/allmydata/test/test_util.py b/src/allmydata/test/test_util.py index 9887897cf..8f3a39670 100644 --- a/src/allmydata/test/test_util.py +++ b/src/allmydata/test/test_util.py @@ -495,10 +495,10 @@ class YAML(unittest.TestCase): class JSONBytes(unittest.TestCase): - """Tests for BytesJSONEncoder.""" + """Tests for jsonbytes module.""" def test_encode_bytes(self): - """BytesJSONEncoder can encode bytes. + """jsonbytes.dumps() encodes bytes. Bytes are presumed to be UTF-8 encoded. """ @@ -515,7 +515,7 @@ class JSONBytes(unittest.TestCase): self.assertEqual(jsonbytes.loads(encoded), expected) def test_encode_unicode(self): - """BytesJSONEncoder encodes Unicode string as usual.""" + """jsonbytes.dumps() encodes Unicode string as usual.""" expected = { u"hello": [1, u"cd"], } @@ -529,6 +529,37 @@ class JSONBytes(unittest.TestCase): self.assertIsInstance(encoded, bytes) self.assertEqual(json.loads(encoded, encoding="utf-8"), x) + def test_any_bytes_unsupported_by_default(self): + """By default non-UTF-8 bytes raise error.""" + bytestring = b"abc\xff\x00" + with self.assertRaises(UnicodeDecodeError): + jsonbytes.dumps(bytestring) + with self.assertRaises(UnicodeDecodeError): + jsonbytes.dumps_bytes(bytestring) + with self.assertRaises(UnicodeDecodeError): + json.dumps(bytestring, cls=jsonbytes.UTF8BytesJSONEncoder) + + def test_any_bytes(self): + """If any_bytes is True, non-UTF-8 bytes don't break encoding.""" + bytestring = b"abc\xff" + o = {bytestring: bytestring} + expected = {"abc\\xff": "abc\\xff"} + self.assertEqual( + json.loads(jsonbytes.dumps(o, any_bytes=True)), + expected, + ) + self.assertEqual( + json.loads(json.dumps( + o, cls=jsonbytes.AnyBytesJSONEncoder)), + expected, + ) + self.assertEqual( + json.loads(jsonbytes.dumps(o, any_bytes=True), + encoding="utf-8"), + expected, + ) + + class FakeGetVersion(object): """Emulate an object with a get_version.""" diff --git a/src/allmydata/util/eliotutil.py b/src/allmydata/util/eliotutil.py index 5d144eb1d..4e48fbb9f 100644 --- a/src/allmydata/util/eliotutil.py +++ b/src/allmydata/util/eliotutil.py @@ -87,7 +87,7 @@ from twisted.internet.defer import ( ) from twisted.application.service import Service -from .jsonbytes import BytesJSONEncoder +from .jsonbytes import AnyBytesJSONEncoder def validateInstanceOf(t): @@ -306,7 +306,7 @@ class _DestinationParser(object): rotateLength=rotate_length, maxRotatedFiles=max_rotated_files, ) - return lambda reactor: FileDestination(get_file(), BytesJSONEncoder) + return lambda reactor: FileDestination(get_file(), AnyBytesJSONEncoder) _parse_destination_description = _DestinationParser().parse @@ -333,4 +333,4 @@ def log_call_deferred(action_type): if PY2: capture_logging = eliot_capture_logging else: - capture_logging = partial(eliot_capture_logging, encoder_=BytesJSONEncoder) + capture_logging = partial(eliot_capture_logging, encoder_=AnyBytesJSONEncoder) diff --git a/src/allmydata/util/jsonbytes.py b/src/allmydata/util/jsonbytes.py index c46a932d0..849fd6f0a 100644 --- a/src/allmydata/util/jsonbytes.py +++ b/src/allmydata/util/jsonbytes.py @@ -16,43 +16,75 @@ if PY2: import json -def _bytes_to_unicode(obj): - """Convert any bytes objects to unicode, recursively.""" - if isinstance(obj, bytes): - return obj.decode("utf-8") - if isinstance(obj, dict): - new_obj = {} - for k, v in obj.items(): - if isinstance(k, bytes): - k = k.decode("utf-8") - v = _bytes_to_unicode(v) - new_obj[k] = v - return new_obj - if isinstance(obj, (list, set, tuple)): - return [_bytes_to_unicode(i) for i in obj] - return obj +def _make_bytes_to_unicode(any_bytes): + """Create a function that recursively converts bytes to unicode. - -class BytesJSONEncoder(json.JSONEncoder): + :param any_bytes: If True, also support non-UTF-8-encoded bytes. """ - A JSON encoder than can also encode bytes. + errors = "backslashreplace" if any_bytes else "strict" - The bytes are assumed to be UTF-8 encoded Unicode strings. + def _bytes_to_unicode(obj): + """Convert any bytes objects to unicode, recursively.""" + if isinstance(obj, bytes): + return obj.decode("utf-8", errors=errors) + if isinstance(obj, dict): + new_obj = {} + for k, v in obj.items(): + if isinstance(k, bytes): + k = k.decode("utf-8", errors=errors) + v = _bytes_to_unicode(v) + new_obj[k] = v + return new_obj + if isinstance(obj, (list, set, tuple)): + return [_bytes_to_unicode(i) for i in obj] + return obj + + return _bytes_to_unicode + + +class UTF8BytesJSONEncoder(json.JSONEncoder): + """ + A JSON encoder than can also encode UTF-8 encoded strings. """ def iterencode(self, o, **kwargs): - return json.JSONEncoder.iterencode(self, _bytes_to_unicode(o), **kwargs) + return json.JSONEncoder.iterencode( + self, _make_bytes_to_unicode(False)(o), **kwargs) + + +class AnyBytesJSONEncoder(json.JSONEncoder): + """ + A JSON encoder than can also encode bytes of any sort. + + Bytes are decoded to strings using UTF-8, if that fails to decode then the + bytes are quoted. + """ + def iterencode(self, o, **kwargs): + return json.JSONEncoder.iterencode( + self, _make_bytes_to_unicode(True)(o), **kwargs) def dumps(obj, *args, **kwargs): """Encode to JSON, supporting bytes as keys or values. - The bytes are assumed to be UTF-8 encoded Unicode strings. + :param bool any_bytes: If False (the default) the bytes are assumed to be + UTF-8 encoded Unicode strings. If True, non-UTF-8 bytes are quoted for + human consumption. """ - return json.dumps(obj, cls=BytesJSONEncoder, *args, **kwargs) + any_bytes = kwargs.pop("any_bytes", False) + if any_bytes: + cls = AnyBytesJSONEncoder + else: + cls = UTF8BytesJSONEncoder + return json.dumps(obj, cls=cls, *args, **kwargs) def dumps_bytes(obj, *args, **kwargs): - """Encode to JSON, then encode as bytes.""" + """Encode to JSON, then encode as bytes. + + :param bool all_bytes: If False (the default) the bytes are assumed to be + UTF-8 encoded Unicode strings. If True, non-UTF-8 bytes are quoted for + human consumption. + """ result = dumps(obj, *args, **kwargs) if PY3: result = result.encode("utf-8") diff --git a/src/allmydata/web/logs.py b/src/allmydata/web/logs.py index 9bd59ae53..a79440eb9 100644 --- a/src/allmydata/web/logs.py +++ b/src/allmydata/web/logs.py @@ -47,10 +47,7 @@ class TokenAuthenticatedWebSocketServerProtocol(WebSocketServerProtocol): """ # probably want a try/except around here? what do we do if # transmission fails or anything else bad happens? - encoded = json.dumps(message) - if isinstance(encoded, str): - # On Python 3 dumps() returns Unicode... - encoded = encoded.encode("utf-8") + encoded = json.dumps_bytes(message, any_bytes=True) self.sendMessage(encoded) def onOpen(self): From d60bc2841abe03ff459f9c9c4687da5da2c2ff9d Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 14 Apr 2021 11:19:04 -0400 Subject: [PATCH 3/9] Oh right, Python 2 Eliot doesn't support custom JSON encoders. --- src/allmydata/test/eliotutil.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/allmydata/test/eliotutil.py b/src/allmydata/test/eliotutil.py index 35dfb09eb..1685744fd 100644 --- a/src/allmydata/test/eliotutil.py +++ b/src/allmydata/test/eliotutil.py @@ -73,7 +73,10 @@ RUN_TEST = ActionType( # On Python 3, we want to use our custom JSON encoder when validating messages # can be encoded to JSON: -_memory_logger = lambda: MemoryLogger(encoder=AnyBytesJSONEncoder) +if PY2: + _memory_logger = MemoryLogger +else: + _memory_logger = lambda: MemoryLogger(encoder=AnyBytesJSONEncoder) @attr.s From 51ebbae15a0a97c0784d7b4863d25d660bc04e89 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Fri, 16 Apr 2021 11:21:47 -0400 Subject: [PATCH 4/9] Fix typo. --- src/allmydata/test/web/test_logs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/allmydata/test/web/test_logs.py b/src/allmydata/test/web/test_logs.py index d4fa5e944..579e51dbc 100644 --- a/src/allmydata/test/web/test_logs.py +++ b/src/allmydata/test/web/test_logs.py @@ -92,7 +92,7 @@ class TestStreamingLogs(unittest.TestCase): @inlineCallbacks def test_one_log(self): """ - Write a single Eliot log actin and see it streamed via websocket. + Write a single Eliot log action and see it streamed via websocket. """ proto = yield self.agent.open( From bc9e4ac72859c7e79ccce3c18158f6721ee5a269 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Fri, 16 Apr 2021 11:36:53 -0400 Subject: [PATCH 5/9] Support quoting any-old-bytes correctly on Python 2. --- src/allmydata/test/test_util.py | 4 ++-- src/allmydata/util/jsonbytes.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/allmydata/test/test_util.py b/src/allmydata/test/test_util.py index 8f3a39670..4c2e98683 100644 --- a/src/allmydata/test/test_util.py +++ b/src/allmydata/test/test_util.py @@ -541,9 +541,9 @@ class JSONBytes(unittest.TestCase): def test_any_bytes(self): """If any_bytes is True, non-UTF-8 bytes don't break encoding.""" - bytestring = b"abc\xff" + bytestring = b"abc\xff\xff123" o = {bytestring: bytestring} - expected = {"abc\\xff": "abc\\xff"} + expected = {"abc\\xff\\xff123": "abc\\xff\\xff123"} self.assertEqual( json.loads(jsonbytes.dumps(o, any_bytes=True)), expected, diff --git a/src/allmydata/util/jsonbytes.py b/src/allmydata/util/jsonbytes.py index 849fd6f0a..995165ee6 100644 --- a/src/allmydata/util/jsonbytes.py +++ b/src/allmydata/util/jsonbytes.py @@ -14,7 +14,18 @@ if PY2: from future.builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, str, max, min # noqa: F401 import json +import codecs +if PY2: + def backslashreplace_py2(ex): + """ + On Python 2 'backslashreplace' error handler doesn't work, so write our + own. + """ + return ''.join('\\x{:02x}'.format(ord(c)) + for c in ex.object[ex.start:ex.end]), ex.end + + codecs.register_error("backslashreplace_tahoe_py2", backslashreplace_py2) def _make_bytes_to_unicode(any_bytes): """Create a function that recursively converts bytes to unicode. @@ -22,6 +33,8 @@ def _make_bytes_to_unicode(any_bytes): :param any_bytes: If True, also support non-UTF-8-encoded bytes. """ errors = "backslashreplace" if any_bytes else "strict" + if PY2 and errors == "backslashreplace": + errors = "backslashreplace_tahoe_py2" def _bytes_to_unicode(obj): """Convert any bytes objects to unicode, recursively.""" From 6e8dde3b14b80f6bdccd6c6ba263d315d9eed7bc Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 21 Apr 2021 10:09:03 -0400 Subject: [PATCH 6/9] Simplify. --- src/allmydata/util/jsonbytes.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/allmydata/util/jsonbytes.py b/src/allmydata/util/jsonbytes.py index 995165ee6..2d3cb7504 100644 --- a/src/allmydata/util/jsonbytes.py +++ b/src/allmydata/util/jsonbytes.py @@ -27,16 +27,18 @@ if PY2: codecs.register_error("backslashreplace_tahoe_py2", backslashreplace_py2) -def _make_bytes_to_unicode(any_bytes): + +def _bytes_to_unicode(any_bytes, obj): """Create a function that recursively converts bytes to unicode. :param any_bytes: If True, also support non-UTF-8-encoded bytes. + :param obj: Object to de-byte-ify. """ errors = "backslashreplace" if any_bytes else "strict" if PY2 and errors == "backslashreplace": errors = "backslashreplace_tahoe_py2" - def _bytes_to_unicode(obj): + def doit(obj): """Convert any bytes objects to unicode, recursively.""" if isinstance(obj, bytes): return obj.decode("utf-8", errors=errors) @@ -45,14 +47,14 @@ def _make_bytes_to_unicode(any_bytes): for k, v in obj.items(): if isinstance(k, bytes): k = k.decode("utf-8", errors=errors) - v = _bytes_to_unicode(v) + v = doit(v) new_obj[k] = v return new_obj if isinstance(obj, (list, set, tuple)): - return [_bytes_to_unicode(i) for i in obj] + return [doit(i) for i in obj] return obj - return _bytes_to_unicode + return doit(obj) class UTF8BytesJSONEncoder(json.JSONEncoder): @@ -61,7 +63,7 @@ class UTF8BytesJSONEncoder(json.JSONEncoder): """ def iterencode(self, o, **kwargs): return json.JSONEncoder.iterencode( - self, _make_bytes_to_unicode(False)(o), **kwargs) + self, _bytes_to_unicode(False, o), **kwargs) class AnyBytesJSONEncoder(json.JSONEncoder): @@ -73,7 +75,7 @@ class AnyBytesJSONEncoder(json.JSONEncoder): """ def iterencode(self, o, **kwargs): return json.JSONEncoder.iterencode( - self, _make_bytes_to_unicode(True)(o), **kwargs) + self, _bytes_to_unicode(True, o), **kwargs) def dumps(obj, *args, **kwargs): From 08cb514eeea1226fca1c642207e70b19d3adf39a Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 21 Apr 2021 10:09:26 -0400 Subject: [PATCH 7/9] Correct parameter name. --- src/allmydata/util/jsonbytes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/allmydata/util/jsonbytes.py b/src/allmydata/util/jsonbytes.py index 2d3cb7504..152b79861 100644 --- a/src/allmydata/util/jsonbytes.py +++ b/src/allmydata/util/jsonbytes.py @@ -96,7 +96,7 @@ def dumps(obj, *args, **kwargs): def dumps_bytes(obj, *args, **kwargs): """Encode to JSON, then encode as bytes. - :param bool all_bytes: If False (the default) the bytes are assumed to be + :param bool any_bytes: If False (the default) the bytes are assumed to be UTF-8 encoded Unicode strings. If True, non-UTF-8 bytes are quoted for human consumption. """ From e090891935a13a4953ced575b60a5189c05808dc Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 21 Apr 2021 10:39:27 -0400 Subject: [PATCH 8/9] In PyPy encode() doesn't call iterencode(). --- src/allmydata/util/jsonbytes.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/allmydata/util/jsonbytes.py b/src/allmydata/util/jsonbytes.py index 152b79861..f6143f4d1 100644 --- a/src/allmydata/util/jsonbytes.py +++ b/src/allmydata/util/jsonbytes.py @@ -61,6 +61,10 @@ class UTF8BytesJSONEncoder(json.JSONEncoder): """ A JSON encoder than can also encode UTF-8 encoded strings. """ + def encode(self, o, **kwargs): + return json.JSONEncoder.encode( + self, _bytes_to_unicode(False, o), **kwargs) + def iterencode(self, o, **kwargs): return json.JSONEncoder.iterencode( self, _bytes_to_unicode(False, o), **kwargs) @@ -73,6 +77,10 @@ class AnyBytesJSONEncoder(json.JSONEncoder): Bytes are decoded to strings using UTF-8, if that fails to decode then the bytes are quoted. """ + def encode(self, o, **kwargs): + return json.JSONEncoder.encode( + self, _bytes_to_unicode(True, o), **kwargs) + def iterencode(self, o, **kwargs): return json.JSONEncoder.iterencode( self, _bytes_to_unicode(True, o), **kwargs) From 567c0f019e0a20f3aeb16de46b15ac820c0d3afe Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Thu, 22 Apr 2021 09:39:50 -0400 Subject: [PATCH 9/9] Test random bytes. --- src/allmydata/test/web/test_logs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/allmydata/test/web/test_logs.py b/src/allmydata/test/web/test_logs.py index 579e51dbc..89ec7ba42 100644 --- a/src/allmydata/test/web/test_logs.py +++ b/src/allmydata/test/web/test_logs.py @@ -109,7 +109,7 @@ class TestStreamingLogs(unittest.TestCase): def do_a_thing(arguments): pass - do_a_thing(arguments=[u"hello", b"good-day", 123, {"a": 35}, [None]]) + do_a_thing(arguments=[u"hello", b"good-\xff-day", 123, {"a": 35}, [None]]) proto.transport.loseConnection() yield proto.is_closed @@ -117,7 +117,7 @@ class TestStreamingLogs(unittest.TestCase): self.assertEqual(len(messages), 2) self.assertEqual(messages[0]["action_type"], "test:cli:some-exciting-action") self.assertEqual(messages[0]["arguments"], - ["hello", "good-day", 123, {"a": 35}, [None]]) + ["hello", "good-\\xff-day", 123, {"a": 35}, [None]]) self.assertEqual(messages[1]["action_type"], "test:cli:some-exciting-action") self.assertEqual("started", messages[0]["action_status"]) self.assertEqual("succeeded", messages[1]["action_status"])