Merge pull request #774 from tahoe-lafs/3376.encodingutil-python-3

Port encodingutil to Python 3, part 1

Fixes ticket:3376
This commit is contained in:
Itamar Turner-Trauring 2020-08-14 14:30:19 -04:00 committed by GitHub
commit 59b5f8c0b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 235 additions and 135 deletions

0
newsfragments/3376.minor Normal file
View File

View File

@ -1,5 +1,5 @@
"""Directory Node implementation."""
import time, unicodedata
import time
from zope.interface import implementer
from twisted.internet import defer
@ -18,7 +18,7 @@ from allmydata.check_results import DeepCheckResults, \
DeepCheckAndRepairResults
from allmydata.monitor import Monitor
from allmydata.util import hashutil, base32, log
from allmydata.util.encodingutil import quote_output
from allmydata.util.encodingutil import quote_output, normalize
from allmydata.util.assertutil import precondition
from allmydata.util.netstring import netstring, split_netstring
from allmydata.util.consumer import download_to_data
@ -101,12 +101,6 @@ def update_metadata(metadata, new_metadata, now):
return metadata
# 'x' at the end of a variable name indicates that it holds a Unicode string that may not
# be NFC-normalized.
def normalize(namex):
return unicodedata.normalize('NFC', namex)
# TODO: {Deleter,MetadataSetter,Adder}.modify all start by unpacking the
# contents and end by repacking them. It might be better to apply them to
# the unpacked contents.

View File

@ -13,11 +13,17 @@ from future.utils import PY2
if PY2:
from builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, str, max, min # noqa: F401
from past.builtins import unicode
import os
import time
import signal
from twisted.internet import reactor
from twisted.trial import unittest
from ..util.assertutil import precondition
from ..util.encodingutil import unicode_platform, get_filesystem_encoding
class TimezoneMixin(object):
@ -65,3 +71,20 @@ class SignalMixin(object):
if self.sigchldHandler:
signal.signal(signal.SIGCHLD, self.sigchldHandler)
return super(SignalMixin, self).tearDown()
class ReallyEqualMixin(object):
def failUnlessReallyEqual(self, a, b, msg=None):
self.assertEqual(a, b, msg)
self.assertEqual(type(a), type(b), "a :: %r, b :: %r, %r" % (a, b, msg))
def skip_if_cannot_represent_filename(u):
precondition(isinstance(u, unicode))
enc = get_filesystem_encoding()
if not unicode_platform():
try:
u.encode(enc)
except UnicodeEncodeError:
raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")

View File

@ -9,24 +9,16 @@ from twisted.python import failure
from twisted.trial import unittest
from ..util.assertutil import precondition
from allmydata.util.encodingutil import (unicode_platform, get_filesystem_encoding,
get_io_encoding)
from allmydata.util.encodingutil import get_io_encoding
from future.utils import PY2
if PY2: # XXX this is a hack that makes some tests pass on Python3, remove
# in the future
from ..scripts import runner
from .common_py3 import SignalMixin
# Imported for backwards compatibility:
from .common_py3 import (
SignalMixin, skip_if_cannot_represent_filename, ReallyEqualMixin,
)
def skip_if_cannot_represent_filename(u):
precondition(isinstance(u, unicode))
enc = get_filesystem_encoding()
if not unicode_platform():
try:
u.encode(enc)
except UnicodeEncodeError:
raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
def skip_if_cannot_represent_argv(u):
precondition(isinstance(u, unicode))
@ -87,12 +79,6 @@ def flip_one_bit(s, offset=0, size=None):
return result
class ReallyEqualMixin(object):
def failUnlessReallyEqual(self, a, b, msg=None):
self.assertEqual(a, b, msg)
self.assertEqual(type(a), type(b), "a :: %r, b :: %r, %r" % (a, b, msg))
class StallMixin(object):
def stall(self, res=None, delay=1):
d = defer.Deferred()
@ -186,3 +172,11 @@ except ImportError:
os.chmod(path, stat.S_IWRITE | stat.S_IEXEC | stat.S_IREAD)
make_readonly = _make_readonly
make_accessible = _make_accessible
__all__ = [
"make_readonly", "make_accessible", "TestMixin", "ShouldFailMixin",
"StallMixin", "skip_if_cannot_represent_argv", "run_cli", "parse_cli",
"DevNullDictionary", "insecurerandstr", "flip_bit", "flip_one_bit",
"SignalMixin", "skip_if_cannot_represent_filename", "ReallyEqualMixin"
]

View File

@ -1,4 +1,14 @@
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from future.utils import PY2, PY3
if PY2:
# We don't import str because omg way too ambiguous in this context.
from builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, max, min # noqa: F401
from past.builtins import unicode
lumiere_nfc = u"lumi\u00E8re"
Artonwall_nfc = u"\u00C4rtonwall.mp3"
@ -43,8 +53,10 @@ if __name__ == "__main__":
for fname in TEST_FILENAMES:
open(os.path.join(tmpdir, fname), 'w').close()
# Use Unicode API under Windows or MacOS X
if sys.platform in ('win32', 'darwin'):
# On Python 2, listing directories returns unicode under Windows or
# MacOS X if the input is unicode. On Python 3, it always returns
# Unicode.
if PY2 and sys.platform in ('win32', 'darwin'):
dirlist = os.listdir(unicode(tmpdir))
else:
dirlist = os.listdir(tmpdir)
@ -59,20 +71,22 @@ if __name__ == "__main__":
import os, sys, locale
from unittest import skipIf
from twisted.trial import unittest
from twisted.python.filepath import FilePath
from allmydata.test.common_util import ReallyEqualMixin
from allmydata.test.common_py3 import (
ReallyEqualMixin, skip_if_cannot_represent_filename,
)
from allmydata.util import encodingutil, fileutil
from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
unicode_to_output, quote_output, quote_path, quote_local_unicode_path, \
quote_filepath, unicode_platform, listdir_unicode, FilenameEncodingError, \
get_io_encoding, get_filesystem_encoding, to_str, from_utf8_or_none, _reload, \
to_filepath, extend_filepath, unicode_from_filepath, unicode_segments_from
from allmydata.dirnode import normalize
from .common_util import skip_if_cannot_represent_filename
to_filepath, extend_filepath, unicode_from_filepath, unicode_segments_from, \
unicode_to_argv
from twisted.python import usage
@ -90,7 +104,7 @@ class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
mock_stdout.encoding = 'cp65001'
_reload()
self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
self.assertEqual(get_io_encoding(), 'utf-8')
mock_stdout.encoding = 'koi8-r'
expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
@ -122,7 +136,7 @@ class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
preferredencoding = None
_reload()
self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
self.assertEqual(get_io_encoding(), 'utf-8')
def test_argv_to_unicode(self):
encodingutil.io_encoding = 'utf-8'
@ -150,6 +164,7 @@ class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
# The following tests apply only to platforms that don't store filenames as
# Unicode entities on the filesystem.
class EncodingUtilNonUnicodePlatform(unittest.TestCase):
@skipIf(PY3, "Python 3 is always Unicode, regardless of OS.")
def setUp(self):
# Mock sys.platform because unicode_platform() uses it
self.original_platform = sys.platform
@ -211,7 +226,7 @@ class EncodingUtil(ReallyEqualMixin):
self.failUnlessReallyEqual(argv_to_unicode(argv), argu)
def test_unicode_to_url(self):
self.failUnless(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")
self.failUnless(unicode_to_url(lumiere_nfc), b"lumi\xc3\xa8re")
def test_unicode_to_output(self):
if 'argv' not in dir(self):
@ -224,7 +239,18 @@ class EncodingUtil(ReallyEqualMixin):
_reload()
self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)
def test_unicode_platform(self):
@skipIf(PY3, "Python 2 only.")
def test_unicode_to_argv_py2(self):
"""unicode_to_argv() converts to bytes on Python 2."""
self.assertEqual(unicode_to_argv("abc"), u"abc".encode(self.io_encoding))
@skipIf(PY2, "Python 3 only.")
def test_unicode_to_argv_py3(self):
"""unicode_to_argv() is noop on Python 3."""
self.assertEqual(unicode_to_argv("abc"), "abc")
@skipIf(PY3, "Python 3 only.")
def test_unicode_platform_py2(self):
matrix = {
'linux2': False,
'linux3': False,
@ -236,6 +262,11 @@ class EncodingUtil(ReallyEqualMixin):
_reload()
self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])
@skipIf(PY2, "Python 3 isn't Python 2.")
def test_unicode_platform_py3(self):
_reload()
self.failUnlessReallyEqual(unicode_platform(), True)
def test_listdir_unicode(self):
if 'dirlist' not in dir(self):
return
@ -248,7 +279,14 @@ class EncodingUtil(ReallyEqualMixin):
% (self.filesystem_encoding,))
def call_os_listdir(path):
return self.dirlist
if PY2:
return self.dirlist
else:
# Python 3 always lists unicode filenames:
return [d.decode(self.filesystem_encoding) if isinstance(d, bytes)
else d
for d in self.dirlist]
self.patch(os, 'listdir', call_os_listdir)
def call_sys_getfilesystemencoding():
@ -258,7 +296,7 @@ class EncodingUtil(ReallyEqualMixin):
_reload()
filenames = listdir_unicode(u'/dummy')
self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
self.failUnlessEqual(set([encodingutil.normalize(fname) for fname in filenames]),
set(TEST_FILENAMES))
@ -278,12 +316,16 @@ class StdlibUnicode(unittest.TestCase):
fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
open(fn, 'wb').close()
self.failUnless(os.path.exists(fn))
self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn)))
if PY2:
getcwdu = os.getcwdu
else:
getcwdu = os.getcwd
self.failUnless(os.path.exists(os.path.join(getcwdu(), fn)))
filenames = listdir_unicode(lumiere_nfc)
# We only require that the listing includes a filename that is canonically equivalent
# to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames]))
self.failUnlessIn(lumiere_nfc + u".txt", set([encodingutil.normalize(fname) for fname in filenames]))
expanded = fileutil.expanduser(u"~/" + lumiere_nfc)
self.failIfIn(u"~", expanded)
@ -314,59 +356,70 @@ class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
if out[0:2] == 'b"':
pass
elif isinstance(inp, str):
self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quote_newlines=quote_newlines), out)
self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
elif isinstance(inp, bytes):
try:
unicode_inp = inp.decode("utf-8")
except UnicodeDecodeError:
# Some things decode on Python 2, but not Python 3...
return
self.failUnlessReallyEqual(quote_output(unicode_inp, encoding=enc, quote_newlines=quote_newlines), out)
self.failUnlessReallyEqual(quote_output(unicode_inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
else:
self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quote_newlines=quote_newlines), out)
self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
try:
bytes_inp = inp.encode('utf-8')
except UnicodeEncodeError:
# Some things encode on Python 2, but not Python 3, e.g.
# surrogates like u"\uDC00\uD800"...
return
self.failUnlessReallyEqual(quote_output(bytes_inp, encoding=enc, quote_newlines=quote_newlines), out)
self.failUnlessReallyEqual(quote_output(bytes_inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
def _test_quote_output_all(self, enc):
def check(inp, out, optional_quotes=False, quote_newlines=None):
self._check(inp, out, enc, optional_quotes, quote_newlines)
# optional single quotes
check("foo", "'foo'", True)
check("\\", "'\\'", True)
check("$\"`", "'$\"`'", True)
check("\n", "'\n'", True, quote_newlines=False)
check(b"foo", b"'foo'", True)
check(b"\\", b"'\\'", True)
check(b"$\"`", b"'$\"`'", True)
check(b"\n", b"'\n'", True, quote_newlines=False)
# mandatory single quotes
check("\"", "'\"'")
check(b"\"", b"'\"'")
# double quotes
check("'", "\"'\"")
check("\n", "\"\\x0a\"", quote_newlines=True)
check("\x00", "\"\\x00\"")
check(b"'", b"\"'\"")
check(b"\n", b"\"\\x0a\"", quote_newlines=True)
check(b"\x00", b"\"\\x00\"")
# invalid Unicode and astral planes
check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"")
check(u"\uDC00\uD800", "\"\\udc00\\ud800\"")
check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
check(u"\uD800\uDC00", "\"\\U00010000\"")
check(u"\uD800\uDC01", "\"\\U00010001\"")
check(u"\uD801\uDC00", "\"\\U00010400\"")
check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"")
check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"")
check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"")
check(u"\uFDD0\uFDEF", b"\"\\ufdd0\\ufdef\"")
check(u"\uDC00\uD800", b"\"\\udc00\\ud800\"")
check(u"\uDC00\uD800\uDC00", b"\"\\udc00\\U00010000\"")
check(u"\uD800\uDC00", b"\"\\U00010000\"")
check(u"\uD800\uDC01", b"\"\\U00010001\"")
check(u"\uD801\uDC00", b"\"\\U00010400\"")
check(u"\uDBFF\uDFFF", b"\"\\U0010ffff\"")
check(u"'\uDBFF\uDFFF", b"\"'\\U0010ffff\"")
check(u"\"\uDBFF\uDFFF", b"\"\\\"\\U0010ffff\"")
# invalid UTF-8
check("\xFF", "b\"\\xff\"")
check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")
check(b"\xFF", b"b\"\\xff\"")
check(b"\x00\"$\\`\x80\xFF", b"b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")
def test_quote_output_ascii(self, enc='ascii'):
def check(inp, out, optional_quotes=False, quote_newlines=None):
self._check(inp, out, enc, optional_quotes, quote_newlines)
self._test_quote_output_all(enc)
check(u"\u00D7", "\"\\xd7\"")
check(u"'\u00D7", "\"'\\xd7\"")
check(u"\"\u00D7", "\"\\\"\\xd7\"")
check(u"\u2621", "\"\\u2621\"")
check(u"'\u2621", "\"'\\u2621\"")
check(u"\"\u2621", "\"\\\"\\u2621\"")
check(u"\n", "'\n'", True, quote_newlines=False)
check(u"\n", "\"\\x0a\"", quote_newlines=True)
check(u"\u00D7", b"\"\\xd7\"")
check(u"'\u00D7", b"\"'\\xd7\"")
check(u"\"\u00D7", b"\"\\\"\\xd7\"")
check(u"\u2621", b"\"\\u2621\"")
check(u"'\u2621", b"\"'\\u2621\"")
check(u"\"\u2621", b"\"\\\"\\u2621\"")
check(u"\n", b"'\n'", True, quote_newlines=False)
check(u"\n", b"\"\\x0a\"", quote_newlines=True)
def test_quote_output_latin1(self, enc='latin1'):
def check(inp, out, optional_quotes=False, quote_newlines=None):
@ -411,43 +464,43 @@ def win32_other(win32, other):
class QuotePaths(ReallyEqualMixin, unittest.TestCase):
def test_quote_path(self):
self.failUnlessReallyEqual(quote_path([u'foo', u'bar']), "'foo/bar'")
self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=True), "'foo/bar'")
self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=False), "foo/bar")
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar']), '"foo/\\x0abar"')
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=True), '"foo/\\x0abar"')
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=False), '"foo/\\x0abar"')
self.failUnlessReallyEqual(quote_path([u'foo', u'bar']), b"'foo/bar'")
self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=True), b"'foo/bar'")
self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=False), b"foo/bar")
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar']), b'"foo/\\x0abar"')
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=True), b'"foo/\\x0abar"')
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=False), b'"foo/\\x0abar"')
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo"),
win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'"))
win32_other(b"'C:\\foo'", b"'\\\\?\\C:\\foo'"))
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo", quotemarks=True),
win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'"))
win32_other(b"'C:\\foo'", b"'\\\\?\\C:\\foo'"))
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo", quotemarks=False),
win32_other("C:\\foo", "\\\\?\\C:\\foo"))
win32_other(b"C:\\foo", b"\\\\?\\C:\\foo"))
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar"),
win32_other("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"))
win32_other(b"'\\\\foo\\bar'", b"'\\\\?\\UNC\\foo\\bar'"))
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar", quotemarks=True),
win32_other("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"))
win32_other(b"'\\\\foo\\bar'", b"'\\\\?\\UNC\\foo\\bar'"))
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar", quotemarks=False),
win32_other("\\\\foo\\bar", "\\\\?\\UNC\\foo\\bar"))
win32_other(b"\\\\foo\\bar", b"\\\\?\\UNC\\foo\\bar"))
def test_quote_filepath(self):
foo_bar_fp = FilePath(win32_other(u'C:\\foo\\bar', u'/foo/bar'))
self.failUnlessReallyEqual(quote_filepath(foo_bar_fp),
win32_other("'C:\\foo\\bar'", "'/foo/bar'"))
win32_other(b"'C:\\foo\\bar'", b"'/foo/bar'"))
self.failUnlessReallyEqual(quote_filepath(foo_bar_fp, quotemarks=True),
win32_other("'C:\\foo\\bar'", "'/foo/bar'"))
win32_other(b"'C:\\foo\\bar'", b"'/foo/bar'"))
self.failUnlessReallyEqual(quote_filepath(foo_bar_fp, quotemarks=False),
win32_other("C:\\foo\\bar", "/foo/bar"))
win32_other(b"C:\\foo\\bar", b"/foo/bar"))
if sys.platform == "win32":
foo_longfp = FilePath(u'\\\\?\\C:\\foo')
self.failUnlessReallyEqual(quote_filepath(foo_longfp),
"'C:\\foo'")
b"'C:\\foo'")
self.failUnlessReallyEqual(quote_filepath(foo_longfp, quotemarks=True),
"'C:\\foo'")
b"'C:\\foo'")
self.failUnlessReallyEqual(quote_filepath(foo_longfp, quotemarks=False),
"C:\\foo")
b"C:\\foo")
class FilePaths(ReallyEqualMixin, unittest.TestCase):
@ -501,23 +554,23 @@ class FilePaths(ReallyEqualMixin, unittest.TestCase):
class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
argv = 'lumi\xc3\xa8re'
argv = b'lumi\xc3\xa8re'
platform = 'linux2'
filesystem_encoding = 'UTF-8'
io_encoding = 'UTF-8'
dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
dirlist = [b'test_file', b'\xc3\x84rtonwall.mp3', b'Blah blah.txt']
class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
argv = 'lumi\xe8re'
argv = b'lumi\xe8re'
platform = 'linux2'
filesystem_encoding = 'ISO-8859-1'
io_encoding = 'ISO-8859-1'
dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
dirlist = [b'test_file', b'Blah blah.txt', b'\xc4rtonwall.mp3']
class Windows(EncodingUtil, unittest.TestCase):
uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
argv = 'lumi\xc3\xa8re'
argv = b'lumi\xc3\xa8re'
platform = 'win32'
filesystem_encoding = 'mbcs'
io_encoding = 'utf-8'
@ -525,7 +578,7 @@ class Windows(EncodingUtil, unittest.TestCase):
class MacOSXLeopard(EncodingUtil, unittest.TestCase):
uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
output = 'lumi\xc3\xa8re'
output = b'lumi\xc3\xa8re'
platform = 'darwin'
filesystem_encoding = 'utf-8'
io_encoding = 'UTF-8'
@ -548,14 +601,14 @@ class OpenBSD(EncodingUtil, unittest.TestCase):
class TestToFromStr(ReallyEqualMixin, unittest.TestCase):
def test_to_str(self):
self.failUnlessReallyEqual(to_str("foo"), "foo")
self.failUnlessReallyEqual(to_str("lumi\xc3\xa8re"), "lumi\xc3\xa8re")
self.failUnlessReallyEqual(to_str("\xFF"), "\xFF") # passes through invalid UTF-8 -- is this what we want?
self.failUnlessReallyEqual(to_str(u"lumi\u00E8re"), "lumi\xc3\xa8re")
self.failUnlessReallyEqual(to_str(b"foo"), b"foo")
self.failUnlessReallyEqual(to_str(b"lumi\xc3\xa8re"), b"lumi\xc3\xa8re")
self.failUnlessReallyEqual(to_str(b"\xFF"), b"\xFF") # passes through invalid UTF-8 -- is this what we want?
self.failUnlessReallyEqual(to_str(u"lumi\u00E8re"), b"lumi\xc3\xa8re")
self.failUnlessReallyEqual(to_str(None), None)
def test_from_utf8_or_none(self):
self.failUnlessRaises(AssertionError, from_utf8_or_none, u"foo")
self.failUnlessReallyEqual(from_utf8_or_none("lumi\xc3\xa8re"), u"lumi\u00E8re")
self.failUnlessReallyEqual(from_utf8_or_none(b"lumi\xc3\xa8re"), u"lumi\u00E8re")
self.failUnlessReallyEqual(from_utf8_or_none(None), None)
self.failUnlessRaises(UnicodeDecodeError, from_utf8_or_none, "\xFF")
self.failUnlessRaises(UnicodeDecodeError, from_utf8_or_none, b"\xFF")

View File

@ -41,6 +41,7 @@ PORTED_MODULES = [
"allmydata.util.deferredutil",
"allmydata.util.fileutil",
"allmydata.util.dictutil",
"allmydata.util.encodingutil",
"allmydata.util.gcutil",
"allmydata.util.hashutil",
"allmydata.util.humanreadable",
@ -64,6 +65,7 @@ PORTED_TEST_MODULES = [
"allmydata.test.test_crypto",
"allmydata.test.test_deferredutil",
"allmydata.test.test_dictutil",
"allmydata.test.test_encodingutil",
"allmydata.test.test_happiness",
"allmydata.test.test_hashtree",
"allmydata.test.test_hashutil",

View File

@ -1,9 +1,26 @@
"""
Functions used to convert inputs from whatever encoding used in the system to
unicode and back.
Ported to Python 3.
Once Python 2 support is dropped, most of this module will obsolete, since
Unicode is the default everywhere in Python 3.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from future.utils import PY2, PY3, native_str
if PY2:
# We omit str() because that seems too tricky to get right.
from builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, max, min # noqa: F401
from past.builtins import unicode
import sys, os, re, locale
import unicodedata
from allmydata.util.assertutil import precondition, _assert
from twisted.python import usage
@ -62,13 +79,14 @@ def _reload():
check_encoding(io_encoding)
is_unicode_platform = sys.platform in ["win32", "darwin"]
is_unicode_platform = PY3 or sys.platform in ["win32", "darwin"]
# Despite the Unicode-mode FilePath support added to Twisted in
# <https://twistedmatrix.com/trac/ticket/7805>, we can't yet use
# Unicode-mode FilePaths with INotify on non-Windows platforms
# due to <https://twistedmatrix.com/trac/ticket/7928>.
use_unicode_filepath = sys.platform == "win32"
# due to <https://twistedmatrix.com/trac/ticket/7928>. Supposedly
# 7928 is fixed, though...
use_unicode_filepath = PY3 or sys.platform == "win32"
_reload()
@ -89,7 +107,10 @@ def argv_to_unicode(s):
"""
Decode given argv element to unicode. If this fails, raise a UsageError.
"""
precondition(isinstance(s, str), s)
if isinstance(s, unicode):
return s
precondition(isinstance(s, bytes), s)
try:
return unicode(s, io_encoding)
@ -114,18 +135,22 @@ def unicode_to_argv(s, mangle=False):
If the argument is to be passed to a different process, then the 'mangle' argument
should be true; on Windows, this uses a mangled encoding that will be reversed by
code in runner.py.
On Python 3, just return the string unchanged, since argv is unicode.
"""
precondition(isinstance(s, unicode), s)
if PY3:
return s
if mangle and sys.platform == "win32":
# This must be the same as 'mangle' in bin/tahoe-script.template.
return str(re.sub(u'[^\\x20-\\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
return bytes(re.sub(u'[^\\x20-\\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s), io_encoding)
else:
return s.encode(io_encoding)
def unicode_to_url(s):
"""
Encode an unicode object used in an URL.
Encode an unicode object used in an URL to bytes.
"""
# According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
@ -134,19 +159,19 @@ def unicode_to_url(s):
#precondition(isinstance(s, unicode), s)
#return s.encode('utf-8')
def to_str(s):
if s is None or isinstance(s, str):
def to_str(s): # TODO rename to to_bytes
if s is None or isinstance(s, bytes):
return s
return s.encode('utf-8')
def from_utf8_or_none(s):
precondition(isinstance(s, str) or s is None, s)
precondition(isinstance(s, bytes) or s is None, s)
if s is None:
return s
return s.decode('utf-8')
PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL)
PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
PRINTABLE_ASCII = re.compile(br'^[\n\r\x20-\x7E]*$', re.DOTALL)
PRINTABLE_8BIT = re.compile(br'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
def is_printable_ascii(s):
return PRINTABLE_ASCII.search(s) is not None
@ -160,14 +185,14 @@ def unicode_to_output(s):
try:
out = s.encode(io_encoding)
except (UnicodeEncodeError, UnicodeDecodeError):
raise UnicodeEncodeError(io_encoding, s, 0, 0,
"A string could not be encoded as %s for output to the terminal:\n%r" %
(io_encoding, repr(s)))
raise UnicodeEncodeError(native_str(io_encoding), s, 0, 0,
native_str("A string could not be encoded as %s for output to the terminal:\n%r" %
(io_encoding, repr(s))))
if PRINTABLE_8BIT.search(out) is None:
raise UnicodeEncodeError(io_encoding, s, 0, 0,
"A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
(io_encoding, repr(s)))
raise UnicodeEncodeError(native_str(io_encoding), s, 0, 0,
native_str("A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
(io_encoding, repr(s))))
return out
@ -188,14 +213,17 @@ def _unicode_escape(m, quote_newlines):
else:
return u'\\x%02x' % (codepoint,)
def _str_escape(m, quote_newlines):
def _str_escape(m, quote_newlines): # TODO rename to _bytes_escape
"""
Takes a re match on bytes, the result is escaped bytes of group(0).
"""
c = m.group(0)
if c == '"' or c == '$' or c == '`' or c == '\\':
return '\\' + c
elif c == '\n' and not quote_newlines:
if c == b'"' or c == b'$' or c == b'`' or c == b'\\':
return b'\\' + c
elif c == b'\n' and not quote_newlines:
return c
else:
return '\\x%02x' % (ord(c),)
return b'\\x%02x' % (ord(c),)
MUST_DOUBLE_QUOTE_NL = re.compile(u'[^\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
MUST_DOUBLE_QUOTE = re.compile(u'[^\\n\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
@ -205,7 +233,7 @@ ESCAPABLE_UNICODE = re.compile(u'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid sur
u'[^ !#\\x25-\\x5B\\x5D-\\x5F\\x61-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
re.DOTALL)
ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
ESCAPABLE_8BIT = re.compile( br'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
"""
@ -221,32 +249,32 @@ def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
If not explicitly given, quote_newlines is True when quotemarks is True.
"""
precondition(isinstance(s, (str, unicode)), s)
precondition(isinstance(s, (bytes, unicode)), s)
if quote_newlines is None:
quote_newlines = quotemarks
if isinstance(s, str):
if isinstance(s, bytes):
try:
s = s.decode('utf-8')
except UnicodeDecodeError:
return 'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),)
return b'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),)
must_double_quote = quote_newlines and MUST_DOUBLE_QUOTE_NL or MUST_DOUBLE_QUOTE
if must_double_quote.search(s) is None:
try:
out = s.encode(encoding or io_encoding)
if quotemarks or out.startswith('"'):
return "'%s'" % (out,)
if quotemarks or out.startswith(b'"'):
return b"'%s'" % (out,)
else:
return out
except (UnicodeDecodeError, UnicodeEncodeError):
pass
escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s)
return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
return b'"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
def quote_path(path, quotemarks=True):
return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True)
return quote_output(b"/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True)
def quote_local_unicode_path(path, quotemarks=True):
precondition(isinstance(path, unicode), path)
@ -275,7 +303,7 @@ def extend_filepath(fp, segments):
return fp
def to_filepath(path):
precondition(isinstance(path, unicode if use_unicode_filepath else basestring),
precondition(isinstance(path, unicode if use_unicode_filepath else (bytes, unicode)),
path=path)
if isinstance(path, unicode) and not use_unicode_filepath:
@ -290,7 +318,7 @@ def to_filepath(path):
return FilePath(path)
def _decode(s):
precondition(isinstance(s, basestring), s=s)
precondition(isinstance(s, (bytes, unicode)), s=s)
if isinstance(s, bytes):
return s.decode(filesystem_encoding)
@ -356,3 +384,9 @@ def listdir_unicode(path):
def listdir_filepath(fp):
return listdir_unicode(unicode_from_filepath(fp))
# 'x' at the end of a variable name indicates that it holds a Unicode string that may not
# be NFC-normalized.
def normalize(namex):
return unicodedata.normalize('NFC', namex)