Special logic for roundtripping Unicode to Unicode is only necessary on Python 2.

This commit is contained in:
Itamar Turner-Trauring 2021-04-01 10:01:14 -04:00
parent 8439f2820b
commit de2609151e
2 changed files with 29 additions and 7 deletions

View File

@@ -480,7 +480,12 @@ class EqButNotIs(object):
 class YAML(unittest.TestCase):
     def test_convert(self):
-        data = yaml.safe_dump(["str", u"unicode", u"\u1234nicode"])
+        """
+        Unicode and (ASCII) native strings get roundtripped to Unicode strings.
+        """
+        data = yaml.safe_dump(
+            [six.ensure_str("str"), u"unicode", u"\u1234nicode"]
+        )
         back = yamlutil.safe_load(data)
         self.assertIsInstance(back[0], str)
         self.assertIsInstance(back[1], str)

View File

@@ -1,11 +1,28 @@
+from future.utils import PY2
+
 import yaml
 
-# Announcements contain unicode, because they come from JSON. We tell PyYAML
-# to give us unicode instead of str/bytes.
-def construct_unicode(loader, node):
-    return node.value
-yaml.SafeLoader.add_constructor("tag:yaml.org,2002:str",
-                                construct_unicode)
+if PY2:
+    # On Python 2 the way pyyaml deals with Unicode strings is inconsistent.
+    #
+    # >>> yaml.safe_load(yaml.safe_dump(u"hello"))
+    # 'hello'
+    # >>> yaml.safe_load(yaml.safe_dump(u"hello\u1234"))
+    # u'hello\u1234'
+    #
+    # In other words, Unicode strings get roundtripped to byte strings, but
+    # only sometimes.
+    #
+    # In order to ensure unicode stays unicode, we add a configuration saying
+    # that the YAML String Language-Independent Type ("a sequence of zero or
+    # more Unicode characters") should be the underlying Unicode string object,
+    # rather than converting to bytes when possible.
+    #
+    # Reference: https://yaml.org/type/str.html
+    def construct_unicode(loader, node):
+        return node.value
+    yaml.SafeLoader.add_constructor("tag:yaml.org,2002:str",
+                                    construct_unicode)
 
 def safe_load(f):
     return yaml.safe_load(f)