Special logic for roundtripping Unicode to Unicode is only necessary on Python 2.

This commit is contained in:
Itamar Turner-Trauring 2021-04-01 10:01:14 -04:00
parent 8439f2820b
commit de2609151e
2 changed files with 29 additions and 7 deletions

View File

@ -480,7 +480,12 @@ class EqButNotIs(object):
class YAML(unittest.TestCase):
def test_convert(self):
data = yaml.safe_dump(["str", u"unicode", u"\u1234nicode"])
"""
Unicode and (ASCII) native strings get roundtripped to Unicode strings.
"""
data = yaml.safe_dump(
[six.ensure_str("str"), u"unicode", u"\u1234nicode"]
)
back = yamlutil.safe_load(data)
self.assertIsInstance(back[0], str)
self.assertIsInstance(back[1], str)

View File

@ -1,11 +1,28 @@
from future.utils import PY2
import yaml
# Announcements contain unicode, because they come from JSON. We tell PyYAML
# to give us unicode instead of str/bytes.
def construct_unicode(loader, node):
    """Constructor for YAML ``str`` nodes: hand back ``node.value`` as-is.

    ``loader`` is accepted because PyYAML passes it to every constructor;
    it is not used here.
    """
    return node.value


# Register the constructor for the standard YAML string tag on SafeLoader.
yaml.SafeLoader.add_constructor("tag:yaml.org,2002:str", construct_unicode)
if PY2:
    # PyYAML on Python 2 is inconsistent about what a Unicode string turns
    # into after a dump/load roundtrip:
    #
    #   >>> yaml.safe_load(yaml.safe_dump(u"hello"))
    #   'hello'
    #   >>> yaml.safe_load(yaml.safe_dump(u"hello\u1234"))
    #   u'hello\u1234'
    #
    # So a Unicode string sometimes comes back as a byte string and
    # sometimes stays Unicode.
    #
    # To keep Unicode as Unicode, register a constructor for the YAML String
    # Language-Independent Type ("a sequence of zero or more Unicode
    # characters") that returns the underlying Unicode string object instead
    # of converting it to bytes when possible.
    #
    # Reference: https://yaml.org/type/str.html
    def construct_unicode(loader, node):
        """Return the string node's value unchanged; ``loader`` is unused."""
        return node.value

    yaml.SafeLoader.add_constructor(
        "tag:yaml.org,2002:str",
        construct_unicode,
    )
def safe_load(f):
    """Load YAML from ``f`` via ``yaml.safe_load`` and return the result.

    ``f`` is passed straight through to ``yaml.safe_load``.
    """
    loaded = yaml.safe_load(f)
    return loaded