From de2609151e2add0b505f62c8644a5e86a79e7117 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Thu, 1 Apr 2021 10:01:14 -0400 Subject: [PATCH] Special logic for roundtripping Unicode to Unicode is only necessary on Python 2. --- src/allmydata/test/test_util.py | 7 ++++++- src/allmydata/util/yamlutil.py | 29 +++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/allmydata/test/test_util.py b/src/allmydata/test/test_util.py index a14adb787..4dc2793a4 100644 --- a/src/allmydata/test/test_util.py +++ b/src/allmydata/test/test_util.py @@ -480,7 +480,12 @@ class EqButNotIs(object): class YAML(unittest.TestCase): def test_convert(self): - data = yaml.safe_dump(["str", u"unicode", u"\u1234nicode"]) + """ + Unicode and (ASCII) native strings get roundtripped to Unicode strings. + """ + data = yaml.safe_dump( + [six.ensure_str("str"), u"unicode", u"\u1234nicode"] + ) back = yamlutil.safe_load(data) self.assertIsInstance(back[0], str) self.assertIsInstance(back[1], str) diff --git a/src/allmydata/util/yamlutil.py b/src/allmydata/util/yamlutil.py index 40c38fa30..f7eb8004f 100644 --- a/src/allmydata/util/yamlutil.py +++ b/src/allmydata/util/yamlutil.py @@ -1,11 +1,28 @@ +from future.utils import PY2 + import yaml -# Announcements contain unicode, because they come from JSON. We tell PyYAML -# to give us unicode instead of str/bytes. -def construct_unicode(loader, node): - return node.value -yaml.SafeLoader.add_constructor("tag:yaml.org,2002:str", - construct_unicode) +if PY2: + # On Python 2 the way pyyaml deals with Unicode strings is inconsistent. + # + # >>> yaml.safe_load(yaml.safe_dump(u"hello")) + # 'hello' + # >>> yaml.safe_load(yaml.safe_dump(u"hello\u1234")) + # u'hello\u1234' + # + # In other words, Unicode strings get roundtripped to byte strings, but + # only sometimes. 
+ # + # In order to ensure Unicode stays Unicode, we add a configuration saying + # that the YAML String Language-Independent Type ("a sequence of zero or + # more Unicode characters") should be the underlying Unicode string object, + # rather than converting to bytes when possible. + # + # Reference: https://yaml.org/type/str.html + def construct_unicode(loader, node): + return node.value + yaml.SafeLoader.add_constructor("tag:yaml.org,2002:str", + construct_unicode) def safe_load(f): return yaml.safe_load(f)