From 53bdc10e195228daf10a1af1cd244148edd42d35 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 12 Aug 2020 13:10:10 -0400 Subject: [PATCH] Some steps towards Python 3 support. --- src/allmydata/test/test_encodingutil.py | 76 ++++++++++++++----------- src/allmydata/util/encodingutil.py | 13 +++-- 2 files changed, 51 insertions(+), 38 deletions(-) diff --git a/src/allmydata/test/test_encodingutil.py b/src/allmydata/test/test_encodingutil.py index d0b9793c4..e0adc2dbf 100644 --- a/src/allmydata/test/test_encodingutil.py +++ b/src/allmydata/test/test_encodingutil.py @@ -1,5 +1,9 @@ from __future__ import print_function +from future.utils import PY2 + +from past.builtins import unicode + lumiere_nfc = u"lumi\u00E8re" Artonwall_nfc = u"\u00C4rtonwall.mp3" Artonwall_nfd = u"A\u0308rtonwall.mp3" @@ -44,7 +48,7 @@ if __name__ == "__main__": open(os.path.join(tmpdir, fname), 'w').close() # Use Unicode API under Windows or MacOS X - if sys.platform in ('win32', 'darwin'): + if PY2 and sys.platform in ('win32', 'darwin'): dirlist = os.listdir(unicode(tmpdir)) else: dirlist = os.listdir(tmpdir) @@ -278,7 +282,11 @@ class StdlibUnicode(unittest.TestCase): fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt' open(fn, 'wb').close() self.failUnless(os.path.exists(fn)) - self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn))) + if PY2: + getcwdu = os.getcwdu + else: + getcwdu = os.getcwd + self.failUnless(os.path.exists(os.path.join(getcwdu(), fn))) filenames = listdir_unicode(lumiere_nfc) # We only require that the listing includes a filename that is canonically equivalent @@ -314,7 +322,7 @@ class QuoteOutput(ReallyEqualMixin, unittest.TestCase): self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) if out[0:2] == 'b"': pass - elif isinstance(inp, str): + elif isinstance(inp, bytes): self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quote_newlines=quote_newlines), out) self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) else: @@ -326,47 +334,47 @@ class QuoteOutput(ReallyEqualMixin, unittest.TestCase): self._check(inp, out, enc, optional_quotes, quote_newlines) # optional single quotes - check("foo", "'foo'", True) - check("\\", "'\\'", True) - check("$\"`", "'$\"`'", True) - check("\n", "'\n'", True, quote_newlines=False) + check(b"foo", b"'foo'", True) + check(b"\\", b"'\\'", True) + check(b"$\"`", b"'$\"`'", True) + check(b"\n", b"'\n'", True, quote_newlines=False) # mandatory single quotes - check("\"", "'\"'") + check(b"\"", b"'\"'") # double quotes - check("'", "\"'\"") - check("\n", "\"\\x0a\"", quote_newlines=True) - check("\x00", "\"\\x00\"") + check(b"'", b"\"'\"") + check(b"\n", b"\"\\x0a\"", quote_newlines=True) + check(b"\x00", b"\"\\x00\"") # invalid Unicode and astral planes - check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"") - check(u"\uDC00\uD800", "\"\\udc00\\ud800\"") - check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"") - check(u"\uD800\uDC00", "\"\\U00010000\"") - check(u"\uD800\uDC01", "\"\\U00010001\"") - check(u"\uD801\uDC00", "\"\\U00010400\"") - check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"") - check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"") - check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"") + check(u"\uFDD0\uFDEF", b"\"\\ufdd0\\ufdef\"") + check(u"\uDC00\uD800", b"\"\\udc00\\ud800\"") + check(u"\uDC00\uD800\uDC00", b"\"\\udc00\\U00010000\"") + check(u"\uD800\uDC00", b"\"\\U00010000\"") + check(u"\uD800\uDC01", b"\"\\U00010001\"") + check(u"\uD801\uDC00", b"\"\\U00010400\"") + check(u"\uDBFF\uDFFF", b"\"\\U0010ffff\"") + check(u"'\uDBFF\uDFFF", b"\"'\\U0010ffff\"") + check(u"\"\uDBFF\uDFFF", b"\"\\\"\\U0010ffff\"") # invalid UTF-8 - check("\xFF", "b\"\\xff\"") - check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"") + check("\xFF", b"b\"\\xff\"") + check("\x00\"$\\`\x80\xFF", b"b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"") def test_quote_output_ascii(self, enc='ascii'): def check(inp, out, optional_quotes=False, quote_newlines=None): self._check(inp, out, enc, optional_quotes, quote_newlines) self._test_quote_output_all(enc) - check(u"\u00D7", "\"\\xd7\"") - check(u"'\u00D7", "\"'\\xd7\"") - check(u"\"\u00D7", "\"\\\"\\xd7\"") - check(u"\u2621", "\"\\u2621\"") - check(u"'\u2621", "\"'\\u2621\"") - check(u"\"\u2621", "\"\\\"\\u2621\"") - check(u"\n", "'\n'", True, quote_newlines=False) - check(u"\n", "\"\\x0a\"", quote_newlines=True) + check(u"\u00D7", b"\"\\xd7\"") + check(u"'\u00D7", b"\"'\\xd7\"") + check(u"\"\u00D7", b"\"\\\"\\xd7\"") + check(u"\u2621", b"\"\\u2621\"") + check(u"'\u2621", b"\"'\\u2621\"") + check(u"\"\u2621", b"\"\\\"\\u2621\"") + check(u"\n", b"'\n'", True, quote_newlines=False) + check(u"\n", b"\"\\x0a\"", quote_newlines=True) def test_quote_output_latin1(self, enc='latin1'): def check(inp, out, optional_quotes=False, quote_newlines=None): @@ -548,14 +556,14 @@ class OpenBSD(EncodingUtil, unittest.TestCase): class TestToFromStr(ReallyEqualMixin, unittest.TestCase): def test_to_str(self): - self.failUnlessReallyEqual(to_str("foo"), "foo") - self.failUnlessReallyEqual(to_str("lumi\xc3\xa8re"), "lumi\xc3\xa8re") + self.failUnlessReallyEqual(to_str("foo"), b"foo") + self.failUnlessReallyEqual(to_str("lumi\xc3\xa8re"), b"lumi\xc3\xa8re") self.failUnlessReallyEqual(to_str("\xFF"), "\xFF") # passes through invalid UTF-8 -- is this what we want? - self.failUnlessReallyEqual(to_str(u"lumi\u00E8re"), "lumi\xc3\xa8re") + self.failUnlessReallyEqual(to_str(u"lumi\u00E8re"), b"lumi\xc3\xa8re") self.failUnlessReallyEqual(to_str(None), None) def test_from_utf8_or_none(self): self.failUnlessRaises(AssertionError, from_utf8_or_none, u"foo") - self.failUnlessReallyEqual(from_utf8_or_none("lumi\xc3\xa8re"), u"lumi\u00E8re") + self.failUnlessReallyEqual(from_utf8_or_none(b"lumi\xc3\xa8re"), u"lumi\u00E8re") self.failUnlessReallyEqual(from_utf8_or_none(None), None) self.failUnlessRaises(UnicodeDecodeError, from_utf8_or_none, "\xFF") diff --git a/src/allmydata/util/encodingutil.py b/src/allmydata/util/encodingutil.py index 0ec6511bf..d795eb43c 100644 --- a/src/allmydata/util/encodingutil.py +++ b/src/allmydata/util/encodingutil.py @@ -3,6 +3,8 @@ Functions used to convert inputs from whatever encoding used in the system to unicode and back. """ +from future.utils import PY2 + from past.builtins import unicode import sys, os, re, locale @@ -65,13 +67,13 @@ def _reload(): check_encoding(io_encoding) - is_unicode_platform = sys.platform in ["win32", "darwin"] + is_unicode_platform = not PY2 or sys.platform in ["win32", "darwin"] # Despite the Unicode-mode FilePath support added to Twisted in # , we can't yet use # Unicode-mode FilePaths with INotify on non-Windows platforms # due to . - use_unicode_filepath = sys.platform == "win32" + use_unicode_filepath = not PY2 or sys.platform == "win32" _reload() @@ -92,6 +94,9 @@ def argv_to_unicode(s): """ Decode given argv element to unicode. If this fails, raise a UsageError. """ + if isinstance(s, unicode): + return s + precondition(isinstance(s, bytes), s) try: @@ -122,7 +127,7 @@ def unicode_to_argv(s, mangle=False): if mangle and sys.platform == "win32": # This must be the same as 'mangle' in bin/tahoe-script.template. - return str(re.sub(u'[^\\x20-\\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s)) + return bytes(re.sub(u'[^\\x20-\\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s)) else: return s.encode(io_encoding) @@ -143,7 +148,7 @@ def to_str(s): # TODO rename to to_bytes return s.encode('utf-8') def from_utf8_or_none(s): - precondition(isinstance(s, str) or s is None, s) + precondition(isinstance(s, bytes) or s is None, s) if s is None: return s return s.decode('utf-8')