diff --git a/newsfragments/3376.minor b/newsfragments/3376.minor new file mode 100644 index 000000000..e69de29bb diff --git a/src/allmydata/dirnode.py b/src/allmydata/dirnode.py index f1c95697b..38cb26caf 100644 --- a/src/allmydata/dirnode.py +++ b/src/allmydata/dirnode.py @@ -1,5 +1,5 @@ """Directory Node implementation.""" -import time, unicodedata +import time from zope.interface import implementer from twisted.internet import defer @@ -18,7 +18,7 @@ from allmydata.check_results import DeepCheckResults, \ DeepCheckAndRepairResults from allmydata.monitor import Monitor from allmydata.util import hashutil, base32, log -from allmydata.util.encodingutil import quote_output +from allmydata.util.encodingutil import quote_output, normalize from allmydata.util.assertutil import precondition from allmydata.util.netstring import netstring, split_netstring from allmydata.util.consumer import download_to_data @@ -101,12 +101,6 @@ def update_metadata(metadata, new_metadata, now): return metadata -# 'x' at the end of a variable name indicates that it holds a Unicode string that may not -# be NFC-normalized. - -def normalize(namex): - return unicodedata.normalize('NFC', namex) - # TODO: {Deleter,MetadataSetter,Adder}.modify all start by unpacking the # contents and end by repacking them. It might be better to apply them to # the unpacked contents. 
diff --git a/src/allmydata/test/common_py3.py b/src/allmydata/test/common_py3.py index 0dae05aa6..94f63b4da 100644 --- a/src/allmydata/test/common_py3.py +++ b/src/allmydata/test/common_py3.py @@ -13,11 +13,17 @@ from future.utils import PY2 if PY2: from builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, str, max, min # noqa: F401 +from past.builtins import unicode + import os import time import signal from twisted.internet import reactor +from twisted.trial import unittest + +from ..util.assertutil import precondition +from ..util.encodingutil import unicode_platform, get_filesystem_encoding class TimezoneMixin(object): @@ -65,3 +71,20 @@ class SignalMixin(object): if self.sigchldHandler: signal.signal(signal.SIGCHLD, self.sigchldHandler) return super(SignalMixin, self).tearDown() + + +class ReallyEqualMixin(object): + def failUnlessReallyEqual(self, a, b, msg=None): + self.assertEqual(a, b, msg) + self.assertEqual(type(a), type(b), "a :: %r, b :: %r, %r" % (a, b, msg)) + + +def skip_if_cannot_represent_filename(u): + precondition(isinstance(u, unicode)) + + enc = get_filesystem_encoding() + if not unicode_platform(): + try: + u.encode(enc) + except UnicodeEncodeError: + raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.") diff --git a/src/allmydata/test/common_util.py b/src/allmydata/test/common_util.py index 4e25e897e..110122b25 100644 --- a/src/allmydata/test/common_util.py +++ b/src/allmydata/test/common_util.py @@ -9,24 +9,16 @@ from twisted.python import failure from twisted.trial import unittest from ..util.assertutil import precondition -from allmydata.util.encodingutil import (unicode_platform, get_filesystem_encoding, - - get_io_encoding) +from allmydata.util.encodingutil import get_io_encoding from future.utils import PY2 if PY2: # XXX this is a hack that makes some tests pass on Python3, remove # in the future from ..scripts import runner 
-from .common_py3 import SignalMixin +# Imported for backwards compatibility: +from .common_py3 import ( + SignalMixin, skip_if_cannot_represent_filename, ReallyEqualMixin, +) -def skip_if_cannot_represent_filename(u): - precondition(isinstance(u, unicode)) - - enc = get_filesystem_encoding() - if not unicode_platform(): - try: - u.encode(enc) - except UnicodeEncodeError: - raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.") def skip_if_cannot_represent_argv(u): precondition(isinstance(u, unicode)) @@ -87,12 +79,6 @@ def flip_one_bit(s, offset=0, size=None): return result -class ReallyEqualMixin(object): - def failUnlessReallyEqual(self, a, b, msg=None): - self.assertEqual(a, b, msg) - self.assertEqual(type(a), type(b), "a :: %r, b :: %r, %r" % (a, b, msg)) - - class StallMixin(object): def stall(self, res=None, delay=1): d = defer.Deferred() @@ -186,3 +172,11 @@ except ImportError: os.chmod(path, stat.S_IWRITE | stat.S_IEXEC | stat.S_IREAD) make_readonly = _make_readonly make_accessible = _make_accessible + + +__all__ = [ + "make_readonly", "make_accessible", "TestMixin", "ShouldFailMixin", + "StallMixin", "skip_if_cannot_represent_argv", "run_cli", "parse_cli", + "DevNullDictionary", "insecurerandstr", "flip_bit", "flip_one_bit", + "SignalMixin", "skip_if_cannot_represent_filename", "ReallyEqualMixin" +] diff --git a/src/allmydata/test/test_encodingutil.py b/src/allmydata/test/test_encodingutil.py index 771e9d879..cf72fc9ac 100644 --- a/src/allmydata/test/test_encodingutil.py +++ b/src/allmydata/test/test_encodingutil.py @@ -1,4 +1,14 @@ from __future__ import print_function +from __future__ import absolute_import +from __future__ import division +from __future__ import unicode_literals + +from future.utils import PY2, PY3 +if PY2: + # We don't import str because omg way too ambiguous in this context. 
+ from builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, max, min # noqa: F401 + +from past.builtins import unicode lumiere_nfc = u"lumi\u00E8re" Artonwall_nfc = u"\u00C4rtonwall.mp3" @@ -43,8 +53,10 @@ if __name__ == "__main__": for fname in TEST_FILENAMES: open(os.path.join(tmpdir, fname), 'w').close() - # Use Unicode API under Windows or MacOS X - if sys.platform in ('win32', 'darwin'): + # On Python 2, listing directories returns unicode under Windows or + # MacOS X if the input is unicode. On Python 3, it always returns + # Unicode. + if PY2 and sys.platform in ('win32', 'darwin'): dirlist = os.listdir(unicode(tmpdir)) else: dirlist = os.listdir(tmpdir) @@ -59,20 +71,22 @@ if __name__ == "__main__": import os, sys, locale +from unittest import skipIf from twisted.trial import unittest from twisted.python.filepath import FilePath -from allmydata.test.common_util import ReallyEqualMixin +from allmydata.test.common_py3 import ( + ReallyEqualMixin, skip_if_cannot_represent_filename, +) from allmydata.util import encodingutil, fileutil from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \ unicode_to_output, quote_output, quote_path, quote_local_unicode_path, \ quote_filepath, unicode_platform, listdir_unicode, FilenameEncodingError, \ get_io_encoding, get_filesystem_encoding, to_str, from_utf8_or_none, _reload, \ - to_filepath, extend_filepath, unicode_from_filepath, unicode_segments_from -from allmydata.dirnode import normalize -from .common_util import skip_if_cannot_represent_filename + to_filepath, extend_filepath, unicode_from_filepath, unicode_segments_from, \ + unicode_to_argv from twisted.python import usage @@ -90,7 +104,7 @@ class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase): mock_stdout.encoding = 'cp65001' _reload() - self.failUnlessReallyEqual(get_io_encoding(), 'utf-8') + self.assertEqual(get_io_encoding(), 'utf-8') mock_stdout.encoding = 
'koi8-r' expected = sys.platform == "win32" and 'utf-8' or 'koi8-r' @@ -122,7 +136,7 @@ class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase): preferredencoding = None _reload() - self.failUnlessReallyEqual(get_io_encoding(), 'utf-8') + self.assertEqual(get_io_encoding(), 'utf-8') def test_argv_to_unicode(self): encodingutil.io_encoding = 'utf-8' @@ -150,6 +164,7 @@ class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase): # The following tests apply only to platforms that don't store filenames as # Unicode entities on the filesystem. class EncodingUtilNonUnicodePlatform(unittest.TestCase): + @skipIf(PY3, "Python 3 is always Unicode, regardless of OS.") def setUp(self): # Mock sys.platform because unicode_platform() uses it self.original_platform = sys.platform @@ -211,7 +226,7 @@ class EncodingUtil(ReallyEqualMixin): self.failUnlessReallyEqual(argv_to_unicode(argv), argu) def test_unicode_to_url(self): - self.failUnless(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re") + self.failUnless(unicode_to_url(lumiere_nfc), b"lumi\xc3\xa8re") def test_unicode_to_output(self): if 'argv' not in dir(self): @@ -224,7 +239,18 @@ class EncodingUtil(ReallyEqualMixin): _reload() self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv) - def test_unicode_platform(self): + @skipIf(PY3, "Python 2 only.") + def test_unicode_to_argv_py2(self): + """unicode_to_argv() converts to bytes on Python 2.""" + self.assertEqual(unicode_to_argv("abc"), u"abc".encode(self.io_encoding)) + + @skipIf(PY2, "Python 3 only.") + def test_unicode_to_argv_py3(self): + """unicode_to_argv() is noop on Python 3.""" + self.assertEqual(unicode_to_argv("abc"), "abc") + + @skipIf(PY3, "Python 3 only.") + def test_unicode_platform_py2(self): matrix = { 'linux2': False, 'linux3': False, @@ -236,6 +262,11 @@ class EncodingUtil(ReallyEqualMixin): _reload() self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform]) + @skipIf(PY2, "Python 3 isn't Python 2.") + def 
test_unicode_platform_py3(self): + _reload() + self.failUnlessReallyEqual(unicode_platform(), True) + def test_listdir_unicode(self): if 'dirlist' not in dir(self): return @@ -248,7 +279,14 @@ class EncodingUtil(ReallyEqualMixin): % (self.filesystem_encoding,)) def call_os_listdir(path): - return self.dirlist + if PY2: + return self.dirlist + else: + # Python 3 always lists unicode filenames: + return [d.decode(self.filesystem_encoding) if isinstance(d, bytes) + else d + for d in self.dirlist] + self.patch(os, 'listdir', call_os_listdir) def call_sys_getfilesystemencoding(): @@ -258,7 +296,7 @@ class EncodingUtil(ReallyEqualMixin): _reload() filenames = listdir_unicode(u'/dummy') - self.failUnlessEqual(set([normalize(fname) for fname in filenames]), + self.failUnlessEqual(set([encodingutil.normalize(fname) for fname in filenames]), set(TEST_FILENAMES)) @@ -278,12 +316,16 @@ class StdlibUnicode(unittest.TestCase): fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt' open(fn, 'wb').close() self.failUnless(os.path.exists(fn)) - self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn))) + if PY2: + getcwdu = os.getcwdu + else: + getcwdu = os.getcwd + self.failUnless(os.path.exists(os.path.join(getcwdu(), fn))) filenames = listdir_unicode(lumiere_nfc) # We only require that the listing includes a filename that is canonically equivalent # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent). 
- self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames])) + self.failUnlessIn(lumiere_nfc + u".txt", set([encodingutil.normalize(fname) for fname in filenames])) expanded = fileutil.expanduser(u"~/" + lumiere_nfc) self.failIfIn(u"~", expanded) @@ -314,59 +356,70 @@ class QuoteOutput(ReallyEqualMixin, unittest.TestCase): self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) if out[0:2] == 'b"': pass - elif isinstance(inp, str): - self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quote_newlines=quote_newlines), out) - self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) + elif isinstance(inp, bytes): + try: + unicode_inp = inp.decode("utf-8") + except UnicodeDecodeError: + # Some things decode on Python 2, but not Python 3... + return + self.failUnlessReallyEqual(quote_output(unicode_inp, encoding=enc, quote_newlines=quote_newlines), out) + self.failUnlessReallyEqual(quote_output(unicode_inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) else: - self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quote_newlines=quote_newlines), out) - self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) + try: + bytes_inp = inp.encode('utf-8') + except UnicodeEncodeError: + # Some things encode on Python 2, but not Python 3, e.g. + # surrogates like u"\uDC00\uD800"... 
+ return + self.failUnlessReallyEqual(quote_output(bytes_inp, encoding=enc, quote_newlines=quote_newlines), out) + self.failUnlessReallyEqual(quote_output(bytes_inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) def _test_quote_output_all(self, enc): def check(inp, out, optional_quotes=False, quote_newlines=None): self._check(inp, out, enc, optional_quotes, quote_newlines) # optional single quotes - check("foo", "'foo'", True) - check("\\", "'\\'", True) - check("$\"`", "'$\"`'", True) - check("\n", "'\n'", True, quote_newlines=False) + check(b"foo", b"'foo'", True) + check(b"\\", b"'\\'", True) + check(b"$\"`", b"'$\"`'", True) + check(b"\n", b"'\n'", True, quote_newlines=False) # mandatory single quotes - check("\"", "'\"'") + check(b"\"", b"'\"'") # double quotes - check("'", "\"'\"") - check("\n", "\"\\x0a\"", quote_newlines=True) - check("\x00", "\"\\x00\"") + check(b"'", b"\"'\"") + check(b"\n", b"\"\\x0a\"", quote_newlines=True) + check(b"\x00", b"\"\\x00\"") # invalid Unicode and astral planes - check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"") - check(u"\uDC00\uD800", "\"\\udc00\\ud800\"") - check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"") - check(u"\uD800\uDC00", "\"\\U00010000\"") - check(u"\uD800\uDC01", "\"\\U00010001\"") - check(u"\uD801\uDC00", "\"\\U00010400\"") - check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"") - check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"") - check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"") + check(u"\uFDD0\uFDEF", b"\"\\ufdd0\\ufdef\"") + check(u"\uDC00\uD800", b"\"\\udc00\\ud800\"") + check(u"\uDC00\uD800\uDC00", b"\"\\udc00\\U00010000\"") + check(u"\uD800\uDC00", b"\"\\U00010000\"") + check(u"\uD800\uDC01", b"\"\\U00010001\"") + check(u"\uD801\uDC00", b"\"\\U00010400\"") + check(u"\uDBFF\uDFFF", b"\"\\U0010ffff\"") + check(u"'\uDBFF\uDFFF", b"\"'\\U0010ffff\"") + check(u"\"\uDBFF\uDFFF", b"\"\\\"\\U0010ffff\"") # invalid UTF-8 - check("\xFF", "b\"\\xff\"") - check("\x00\"$\\`\x80\xFF", 
"b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"") + check(b"\xFF", b"b\"\\xff\"") + check(b"\x00\"$\\`\x80\xFF", b"b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"") def test_quote_output_ascii(self, enc='ascii'): def check(inp, out, optional_quotes=False, quote_newlines=None): self._check(inp, out, enc, optional_quotes, quote_newlines) self._test_quote_output_all(enc) - check(u"\u00D7", "\"\\xd7\"") - check(u"'\u00D7", "\"'\\xd7\"") - check(u"\"\u00D7", "\"\\\"\\xd7\"") - check(u"\u2621", "\"\\u2621\"") - check(u"'\u2621", "\"'\\u2621\"") - check(u"\"\u2621", "\"\\\"\\u2621\"") - check(u"\n", "'\n'", True, quote_newlines=False) - check(u"\n", "\"\\x0a\"", quote_newlines=True) + check(u"\u00D7", b"\"\\xd7\"") + check(u"'\u00D7", b"\"'\\xd7\"") + check(u"\"\u00D7", b"\"\\\"\\xd7\"") + check(u"\u2621", b"\"\\u2621\"") + check(u"'\u2621", b"\"'\\u2621\"") + check(u"\"\u2621", b"\"\\\"\\u2621\"") + check(u"\n", b"'\n'", True, quote_newlines=False) + check(u"\n", b"\"\\x0a\"", quote_newlines=True) def test_quote_output_latin1(self, enc='latin1'): def check(inp, out, optional_quotes=False, quote_newlines=None): @@ -411,43 +464,43 @@ def win32_other(win32, other): class QuotePaths(ReallyEqualMixin, unittest.TestCase): def test_quote_path(self): - self.failUnlessReallyEqual(quote_path([u'foo', u'bar']), "'foo/bar'") - self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=True), "'foo/bar'") - self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=False), "foo/bar") - self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar']), '"foo/\\x0abar"') - self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=True), '"foo/\\x0abar"') - self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=False), '"foo/\\x0abar"') + self.failUnlessReallyEqual(quote_path([u'foo', u'bar']), b"'foo/bar'") + self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=True), b"'foo/bar'") + self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=False), 
b"foo/bar") + self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar']), b'"foo/\\x0abar"') + self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=True), b'"foo/\\x0abar"') + self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=False), b'"foo/\\x0abar"') self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo"), - win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'")) + win32_other(b"'C:\\foo'", b"'\\\\?\\C:\\foo'")) self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo", quotemarks=True), - win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'")) + win32_other(b"'C:\\foo'", b"'\\\\?\\C:\\foo'")) self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo", quotemarks=False), - win32_other("C:\\foo", "\\\\?\\C:\\foo")) + win32_other(b"C:\\foo", b"\\\\?\\C:\\foo")) self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar"), - win32_other("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'")) + win32_other(b"'\\\\foo\\bar'", b"'\\\\?\\UNC\\foo\\bar'")) self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar", quotemarks=True), - win32_other("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'")) + win32_other(b"'\\\\foo\\bar'", b"'\\\\?\\UNC\\foo\\bar'")) self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar", quotemarks=False), - win32_other("\\\\foo\\bar", "\\\\?\\UNC\\foo\\bar")) + win32_other(b"\\\\foo\\bar", b"\\\\?\\UNC\\foo\\bar")) def test_quote_filepath(self): foo_bar_fp = FilePath(win32_other(u'C:\\foo\\bar', u'/foo/bar')) self.failUnlessReallyEqual(quote_filepath(foo_bar_fp), - win32_other("'C:\\foo\\bar'", "'/foo/bar'")) + win32_other(b"'C:\\foo\\bar'", b"'/foo/bar'")) self.failUnlessReallyEqual(quote_filepath(foo_bar_fp, quotemarks=True), - win32_other("'C:\\foo\\bar'", "'/foo/bar'")) + win32_other(b"'C:\\foo\\bar'", b"'/foo/bar'")) self.failUnlessReallyEqual(quote_filepath(foo_bar_fp, quotemarks=False), - win32_other("C:\\foo\\bar", "/foo/bar")) + 
win32_other(b"C:\\foo\\bar", b"/foo/bar")) if sys.platform == "win32": foo_longfp = FilePath(u'\\\\?\\C:\\foo') self.failUnlessReallyEqual(quote_filepath(foo_longfp), - "'C:\\foo'") + b"'C:\\foo'") self.failUnlessReallyEqual(quote_filepath(foo_longfp, quotemarks=True), - "'C:\\foo'") + b"'C:\\foo'") self.failUnlessReallyEqual(quote_filepath(foo_longfp, quotemarks=False), - "C:\\foo") + b"C:\\foo") class FilePaths(ReallyEqualMixin, unittest.TestCase): @@ -501,23 +554,23 @@ class FilePaths(ReallyEqualMixin, unittest.TestCase): class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase): uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64' - argv = 'lumi\xc3\xa8re' + argv = b'lumi\xc3\xa8re' platform = 'linux2' filesystem_encoding = 'UTF-8' io_encoding = 'UTF-8' - dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt'] + dirlist = [b'test_file', b'\xc3\x84rtonwall.mp3', b'Blah blah.txt'] class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase): uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64' - argv = 'lumi\xe8re' + argv = b'lumi\xe8re' platform = 'linux2' filesystem_encoding = 'ISO-8859-1' io_encoding = 'ISO-8859-1' - dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3'] + dirlist = [b'test_file', b'Blah blah.txt', b'\xc4rtonwall.mp3'] class Windows(EncodingUtil, unittest.TestCase): uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD' - argv = 'lumi\xc3\xa8re' + argv = b'lumi\xc3\xa8re' platform = 'win32' filesystem_encoding = 'mbcs' io_encoding = 'utf-8' @@ -525,7 +578,7 @@ class Windows(EncodingUtil, unittest.TestCase): class MacOSXLeopard(EncodingUtil, unittest.TestCase): uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc' - output = 'lumi\xc3\xa8re' + output = b'lumi\xc3\xa8re' platform = 'darwin' filesystem_encoding = 'utf-8' io_encoding = 'UTF-8' 
@@ -548,14 +601,14 @@ class OpenBSD(EncodingUtil, unittest.TestCase): class TestToFromStr(ReallyEqualMixin, unittest.TestCase): def test_to_str(self): - self.failUnlessReallyEqual(to_str("foo"), "foo") - self.failUnlessReallyEqual(to_str("lumi\xc3\xa8re"), "lumi\xc3\xa8re") - self.failUnlessReallyEqual(to_str("\xFF"), "\xFF") # passes through invalid UTF-8 -- is this what we want? - self.failUnlessReallyEqual(to_str(u"lumi\u00E8re"), "lumi\xc3\xa8re") + self.failUnlessReallyEqual(to_str(b"foo"), b"foo") + self.failUnlessReallyEqual(to_str(b"lumi\xc3\xa8re"), b"lumi\xc3\xa8re") + self.failUnlessReallyEqual(to_str(b"\xFF"), b"\xFF") # passes through invalid UTF-8 -- is this what we want? + self.failUnlessReallyEqual(to_str(u"lumi\u00E8re"), b"lumi\xc3\xa8re") self.failUnlessReallyEqual(to_str(None), None) def test_from_utf8_or_none(self): self.failUnlessRaises(AssertionError, from_utf8_or_none, u"foo") - self.failUnlessReallyEqual(from_utf8_or_none("lumi\xc3\xa8re"), u"lumi\u00E8re") + self.failUnlessReallyEqual(from_utf8_or_none(b"lumi\xc3\xa8re"), u"lumi\u00E8re") self.failUnlessReallyEqual(from_utf8_or_none(None), None) - self.failUnlessRaises(UnicodeDecodeError, from_utf8_or_none, "\xFF") + self.failUnlessRaises(UnicodeDecodeError, from_utf8_or_none, b"\xFF") diff --git a/src/allmydata/util/_python3.py b/src/allmydata/util/_python3.py index 52d7ddd77..f85d7d28e 100644 --- a/src/allmydata/util/_python3.py +++ b/src/allmydata/util/_python3.py @@ -41,6 +41,7 @@ PORTED_MODULES = [ "allmydata.util.deferredutil", "allmydata.util.fileutil", "allmydata.util.dictutil", + "allmydata.util.encodingutil", "allmydata.util.gcutil", "allmydata.util.hashutil", "allmydata.util.humanreadable", @@ -64,6 +65,7 @@ PORTED_TEST_MODULES = [ "allmydata.test.test_crypto", "allmydata.test.test_deferredutil", "allmydata.test.test_dictutil", + "allmydata.test.test_encodingutil", "allmydata.test.test_happiness", "allmydata.test.test_hashtree", "allmydata.test.test_hashutil", diff --git 
a/src/allmydata/util/encodingutil.py b/src/allmydata/util/encodingutil.py index a6a2b97a5..07416cc93 100644 --- a/src/allmydata/util/encodingutil.py +++ b/src/allmydata/util/encodingutil.py @@ -1,9 +1,26 @@ """ Functions used to convert inputs from whatever encoding used in the system to unicode and back. + +Ported to Python 3. + +Once Python 2 support is dropped, most of this module will be obsolete, since +Unicode is the default everywhere in Python 3. """ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from future.utils import PY2, PY3, native_str +if PY2: + # We omit str() because that seems too tricky to get right. + from builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, max, min # noqa: F401 + +from past.builtins import unicode import sys, os, re, locale +import unicodedata from allmydata.util.assertutil import precondition, _assert from twisted.python import usage @@ -62,13 +79,14 @@ def _reload(): check_encoding(io_encoding) - is_unicode_platform = sys.platform in ["win32", "darwin"] + is_unicode_platform = PY3 or sys.platform in ["win32", "darwin"] # Despite the Unicode-mode FilePath support added to Twisted in # , we can't yet use # Unicode-mode FilePaths with INotify on non-Windows platforms - # due to . - use_unicode_filepath = sys.platform == "win32" + # due to . Supposedly + # 7928 is fixed, though... + use_unicode_filepath = PY3 or sys.platform == "win32" _reload() @@ -89,7 +107,10 @@ def argv_to_unicode(s): """ Decode given argv element to unicode. If this fails, raise a UsageError. 
""" - precondition(isinstance(s, str), s) + if isinstance(s, unicode): + return s + + precondition(isinstance(s, bytes), s) try: return unicode(s, io_encoding) @@ -114,18 +135,22 @@ def unicode_to_argv(s, mangle=False): If the argument is to be passed to a different process, then the 'mangle' argument should be true; on Windows, this uses a mangled encoding that will be reversed by code in runner.py. + + On Python 3, just return the string unchanged, since argv is unicode. """ precondition(isinstance(s, unicode), s) + if PY3: + return s if mangle and sys.platform == "win32": # This must be the same as 'mangle' in bin/tahoe-script.template. - return str(re.sub(u'[^\\x20-\\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s)) + return bytes(re.sub(u'[^\\x20-\\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s), io_encoding) else: return s.encode(io_encoding) def unicode_to_url(s): """ - Encode an unicode object used in an URL. + Encode an unicode object used in an URL to bytes. """ # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded. 
@@ -134,19 +159,19 @@ def unicode_to_url(s): #precondition(isinstance(s, unicode), s) #return s.encode('utf-8') -def to_str(s): - if s is None or isinstance(s, str): +def to_str(s): # TODO rename to to_bytes + if s is None or isinstance(s, bytes): return s return s.encode('utf-8') def from_utf8_or_none(s): - precondition(isinstance(s, str) or s is None, s) + precondition(isinstance(s, bytes) or s is None, s) if s is None: return s return s.decode('utf-8') -PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL) -PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL) +PRINTABLE_ASCII = re.compile(br'^[\n\r\x20-\x7E]*$', re.DOTALL) +PRINTABLE_8BIT = re.compile(br'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL) def is_printable_ascii(s): return PRINTABLE_ASCII.search(s) is not None @@ -160,14 +185,14 @@ def unicode_to_output(s): try: out = s.encode(io_encoding) except (UnicodeEncodeError, UnicodeDecodeError): - raise UnicodeEncodeError(io_encoding, s, 0, 0, - "A string could not be encoded as %s for output to the terminal:\n%r" % - (io_encoding, repr(s))) + raise UnicodeEncodeError(native_str(io_encoding), s, 0, 0, + native_str("A string could not be encoded as %s for output to the terminal:\n%r" % + (io_encoding, repr(s)))) if PRINTABLE_8BIT.search(out) is None: - raise UnicodeEncodeError(io_encoding, s, 0, 0, - "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" % - (io_encoding, repr(s))) + raise UnicodeEncodeError(native_str(io_encoding), s, 0, 0, + native_str("A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" % + (io_encoding, repr(s)))) return out @@ -188,14 +213,17 @@ def _unicode_escape(m, quote_newlines): else: return u'\\x%02x' % (codepoint,) -def _str_escape(m, quote_newlines): +def _str_escape(m, quote_newlines): # TODO rename to _bytes_escape + """ + Takes a re match on bytes, the result is escaped bytes of group(0). 
+ """ c = m.group(0) - if c == '"' or c == '$' or c == '`' or c == '\\': - return '\\' + c - elif c == '\n' and not quote_newlines: + if c == b'"' or c == b'$' or c == b'`' or c == b'\\': + return b'\\' + c + elif c == b'\n' and not quote_newlines: return c else: - return '\\x%02x' % (ord(c),) + return b'\\x%02x' % (ord(c),) MUST_DOUBLE_QUOTE_NL = re.compile(u'[^\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) MUST_DOUBLE_QUOTE = re.compile(u'[^\\n\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) @@ -205,7 +233,7 @@ ESCAPABLE_UNICODE = re.compile(u'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid sur u'[^ !#\\x25-\\x5B\\x5D-\\x5F\\x61-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) -ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL) +ESCAPABLE_8BIT = re.compile( br'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL) def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None): """ @@ -221,32 +249,32 @@ def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None): If not explicitly given, quote_newlines is True when quotemarks is True. 
""" - precondition(isinstance(s, (str, unicode)), s) + precondition(isinstance(s, (bytes, unicode)), s) if quote_newlines is None: quote_newlines = quotemarks - if isinstance(s, str): + if isinstance(s, bytes): try: s = s.decode('utf-8') except UnicodeDecodeError: - return 'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),) + return b'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),) must_double_quote = quote_newlines and MUST_DOUBLE_QUOTE_NL or MUST_DOUBLE_QUOTE if must_double_quote.search(s) is None: try: out = s.encode(encoding or io_encoding) - if quotemarks or out.startswith('"'): - return "'%s'" % (out,) + if quotemarks or out.startswith(b'"'): + return b"'%s'" % (out,) else: return out except (UnicodeDecodeError, UnicodeEncodeError): pass escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s) - return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),) + return b'"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),) def quote_path(path, quotemarks=True): - return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True) + return quote_output(b"/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True) def quote_local_unicode_path(path, quotemarks=True): precondition(isinstance(path, unicode), path) @@ -275,7 +303,7 @@ def extend_filepath(fp, segments): return fp def to_filepath(path): - precondition(isinstance(path, unicode if use_unicode_filepath else basestring), + precondition(isinstance(path, unicode if use_unicode_filepath else (bytes, unicode)), path=path) if isinstance(path, unicode) and not use_unicode_filepath: @@ -290,7 +318,7 @@ def to_filepath(path): return FilePath(path) def _decode(s): - precondition(isinstance(s, basestring), s=s) + precondition(isinstance(s, (bytes, unicode)), s=s) if isinstance(s, bytes): return s.decode(filesystem_encoding) @@ -356,3 +384,9 @@ def listdir_unicode(path): 
def listdir_filepath(fp): return listdir_unicode(unicode_from_filepath(fp)) + + +# 'x' at the end of a variable name indicates that it holds a Unicode string that may not +# be NFC-normalized. +def normalize(namex): + return unicodedata.normalize('NFC', namex)