mirror of
https://github.com/tahoe-lafs/tahoe-lafs.git
synced 2025-04-08 11:24:25 +00:00
Merge pull request #774 from tahoe-lafs/3376.encodingutil-python-3
Port encodingutil to Python 3, part 1 Fixes ticket:3376
This commit is contained in:
commit
59b5f8c0b0
0
newsfragments/3376.minor
Normal file
0
newsfragments/3376.minor
Normal file
@ -1,5 +1,5 @@
|
||||
"""Directory Node implementation."""
|
||||
import time, unicodedata
|
||||
import time
|
||||
|
||||
from zope.interface import implementer
|
||||
from twisted.internet import defer
|
||||
@ -18,7 +18,7 @@ from allmydata.check_results import DeepCheckResults, \
|
||||
DeepCheckAndRepairResults
|
||||
from allmydata.monitor import Monitor
|
||||
from allmydata.util import hashutil, base32, log
|
||||
from allmydata.util.encodingutil import quote_output
|
||||
from allmydata.util.encodingutil import quote_output, normalize
|
||||
from allmydata.util.assertutil import precondition
|
||||
from allmydata.util.netstring import netstring, split_netstring
|
||||
from allmydata.util.consumer import download_to_data
|
||||
@ -101,12 +101,6 @@ def update_metadata(metadata, new_metadata, now):
|
||||
return metadata
|
||||
|
||||
|
||||
# 'x' at the end of a variable name indicates that it holds a Unicode string that may not
|
||||
# be NFC-normalized.
|
||||
|
||||
def normalize(namex):
|
||||
return unicodedata.normalize('NFC', namex)
|
||||
|
||||
# TODO: {Deleter,MetadataSetter,Adder}.modify all start by unpacking the
|
||||
# contents and end by repacking them. It might be better to apply them to
|
||||
# the unpacked contents.
|
||||
|
@ -13,11 +13,17 @@ from future.utils import PY2
|
||||
if PY2:
|
||||
from builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, str, max, min # noqa: F401
|
||||
|
||||
from past.builtins import unicode
|
||||
|
||||
import os
|
||||
import time
|
||||
import signal
|
||||
|
||||
from twisted.internet import reactor
|
||||
from twisted.trial import unittest
|
||||
|
||||
from ..util.assertutil import precondition
|
||||
from ..util.encodingutil import unicode_platform, get_filesystem_encoding
|
||||
|
||||
|
||||
class TimezoneMixin(object):
|
||||
@ -65,3 +71,20 @@ class SignalMixin(object):
|
||||
if self.sigchldHandler:
|
||||
signal.signal(signal.SIGCHLD, self.sigchldHandler)
|
||||
return super(SignalMixin, self).tearDown()
|
||||
|
||||
|
||||
class ReallyEqualMixin(object):
|
||||
def failUnlessReallyEqual(self, a, b, msg=None):
|
||||
self.assertEqual(a, b, msg)
|
||||
self.assertEqual(type(a), type(b), "a :: %r, b :: %r, %r" % (a, b, msg))
|
||||
|
||||
|
||||
def skip_if_cannot_represent_filename(u):
|
||||
precondition(isinstance(u, unicode))
|
||||
|
||||
enc = get_filesystem_encoding()
|
||||
if not unicode_platform():
|
||||
try:
|
||||
u.encode(enc)
|
||||
except UnicodeEncodeError:
|
||||
raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
|
||||
|
@ -9,24 +9,16 @@ from twisted.python import failure
|
||||
from twisted.trial import unittest
|
||||
|
||||
from ..util.assertutil import precondition
|
||||
from allmydata.util.encodingutil import (unicode_platform, get_filesystem_encoding,
|
||||
|
||||
get_io_encoding)
|
||||
from allmydata.util.encodingutil import get_io_encoding
|
||||
from future.utils import PY2
|
||||
if PY2: # XXX this is a hack that makes some tests pass on Python3, remove
|
||||
# in the future
|
||||
from ..scripts import runner
|
||||
from .common_py3 import SignalMixin
|
||||
# Imported for backwards compatibility:
|
||||
from .common_py3 import (
|
||||
SignalMixin, skip_if_cannot_represent_filename, ReallyEqualMixin,
|
||||
)
|
||||
|
||||
def skip_if_cannot_represent_filename(u):
|
||||
precondition(isinstance(u, unicode))
|
||||
|
||||
enc = get_filesystem_encoding()
|
||||
if not unicode_platform():
|
||||
try:
|
||||
u.encode(enc)
|
||||
except UnicodeEncodeError:
|
||||
raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
|
||||
|
||||
def skip_if_cannot_represent_argv(u):
|
||||
precondition(isinstance(u, unicode))
|
||||
@ -87,12 +79,6 @@ def flip_one_bit(s, offset=0, size=None):
|
||||
return result
|
||||
|
||||
|
||||
class ReallyEqualMixin(object):
|
||||
def failUnlessReallyEqual(self, a, b, msg=None):
|
||||
self.assertEqual(a, b, msg)
|
||||
self.assertEqual(type(a), type(b), "a :: %r, b :: %r, %r" % (a, b, msg))
|
||||
|
||||
|
||||
class StallMixin(object):
|
||||
def stall(self, res=None, delay=1):
|
||||
d = defer.Deferred()
|
||||
@ -186,3 +172,11 @@ except ImportError:
|
||||
os.chmod(path, stat.S_IWRITE | stat.S_IEXEC | stat.S_IREAD)
|
||||
make_readonly = _make_readonly
|
||||
make_accessible = _make_accessible
|
||||
|
||||
|
||||
__all__ = [
|
||||
"make_readonly", "make_accessible", "TestMixin", "ShouldFailMixin",
|
||||
"StallMixin", "skip_if_cannot_represent_argv", "run_cli", "parse_cli",
|
||||
"DevNullDictionary", "insecurerandstr", "flip_bit", "flip_one_bit",
|
||||
"SignalMixin", "skip_if_cannot_represent_filename", "ReallyEqualMixin"
|
||||
]
|
||||
|
@ -1,4 +1,14 @@
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from future.utils import PY2, PY3
|
||||
if PY2:
|
||||
# We don't import str because omg way too ambiguous in this context.
|
||||
from builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, max, min # noqa: F401
|
||||
|
||||
from past.builtins import unicode
|
||||
|
||||
lumiere_nfc = u"lumi\u00E8re"
|
||||
Artonwall_nfc = u"\u00C4rtonwall.mp3"
|
||||
@ -43,8 +53,10 @@ if __name__ == "__main__":
|
||||
for fname in TEST_FILENAMES:
|
||||
open(os.path.join(tmpdir, fname), 'w').close()
|
||||
|
||||
# Use Unicode API under Windows or MacOS X
|
||||
if sys.platform in ('win32', 'darwin'):
|
||||
# On Python 2, listing directories returns unicode under Windows or
|
||||
# MacOS X if the input is unicode. On Python 3, it always returns
|
||||
# Unicode.
|
||||
if PY2 and sys.platform in ('win32', 'darwin'):
|
||||
dirlist = os.listdir(unicode(tmpdir))
|
||||
else:
|
||||
dirlist = os.listdir(tmpdir)
|
||||
@ -59,20 +71,22 @@ if __name__ == "__main__":
|
||||
|
||||
|
||||
import os, sys, locale
|
||||
from unittest import skipIf
|
||||
|
||||
from twisted.trial import unittest
|
||||
|
||||
from twisted.python.filepath import FilePath
|
||||
|
||||
from allmydata.test.common_util import ReallyEqualMixin
|
||||
from allmydata.test.common_py3 import (
|
||||
ReallyEqualMixin, skip_if_cannot_represent_filename,
|
||||
)
|
||||
from allmydata.util import encodingutil, fileutil
|
||||
from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
|
||||
unicode_to_output, quote_output, quote_path, quote_local_unicode_path, \
|
||||
quote_filepath, unicode_platform, listdir_unicode, FilenameEncodingError, \
|
||||
get_io_encoding, get_filesystem_encoding, to_str, from_utf8_or_none, _reload, \
|
||||
to_filepath, extend_filepath, unicode_from_filepath, unicode_segments_from
|
||||
from allmydata.dirnode import normalize
|
||||
from .common_util import skip_if_cannot_represent_filename
|
||||
to_filepath, extend_filepath, unicode_from_filepath, unicode_segments_from, \
|
||||
unicode_to_argv
|
||||
from twisted.python import usage
|
||||
|
||||
|
||||
@ -90,7 +104,7 @@ class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
|
||||
|
||||
mock_stdout.encoding = 'cp65001'
|
||||
_reload()
|
||||
self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
|
||||
self.assertEqual(get_io_encoding(), 'utf-8')
|
||||
|
||||
mock_stdout.encoding = 'koi8-r'
|
||||
expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
|
||||
@ -122,7 +136,7 @@ class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
|
||||
|
||||
preferredencoding = None
|
||||
_reload()
|
||||
self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
|
||||
self.assertEqual(get_io_encoding(), 'utf-8')
|
||||
|
||||
def test_argv_to_unicode(self):
|
||||
encodingutil.io_encoding = 'utf-8'
|
||||
@ -150,6 +164,7 @@ class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
|
||||
# The following tests apply only to platforms that don't store filenames as
|
||||
# Unicode entities on the filesystem.
|
||||
class EncodingUtilNonUnicodePlatform(unittest.TestCase):
|
||||
@skipIf(PY3, "Python 3 is always Unicode, regardless of OS.")
|
||||
def setUp(self):
|
||||
# Mock sys.platform because unicode_platform() uses it
|
||||
self.original_platform = sys.platform
|
||||
@ -211,7 +226,7 @@ class EncodingUtil(ReallyEqualMixin):
|
||||
self.failUnlessReallyEqual(argv_to_unicode(argv), argu)
|
||||
|
||||
def test_unicode_to_url(self):
|
||||
self.failUnless(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")
|
||||
self.failUnless(unicode_to_url(lumiere_nfc), b"lumi\xc3\xa8re")
|
||||
|
||||
def test_unicode_to_output(self):
|
||||
if 'argv' not in dir(self):
|
||||
@ -224,7 +239,18 @@ class EncodingUtil(ReallyEqualMixin):
|
||||
_reload()
|
||||
self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)
|
||||
|
||||
def test_unicode_platform(self):
|
||||
@skipIf(PY3, "Python 2 only.")
|
||||
def test_unicode_to_argv_py2(self):
|
||||
"""unicode_to_argv() converts to bytes on Python 2."""
|
||||
self.assertEqual(unicode_to_argv("abc"), u"abc".encode(self.io_encoding))
|
||||
|
||||
@skipIf(PY2, "Python 3 only.")
|
||||
def test_unicode_to_argv_py3(self):
|
||||
"""unicode_to_argv() is noop on Python 3."""
|
||||
self.assertEqual(unicode_to_argv("abc"), "abc")
|
||||
|
||||
@skipIf(PY3, "Python 3 only.")
|
||||
def test_unicode_platform_py2(self):
|
||||
matrix = {
|
||||
'linux2': False,
|
||||
'linux3': False,
|
||||
@ -236,6 +262,11 @@ class EncodingUtil(ReallyEqualMixin):
|
||||
_reload()
|
||||
self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])
|
||||
|
||||
@skipIf(PY2, "Python 3 isn't Python 2.")
|
||||
def test_unicode_platform_py3(self):
|
||||
_reload()
|
||||
self.failUnlessReallyEqual(unicode_platform(), True)
|
||||
|
||||
def test_listdir_unicode(self):
|
||||
if 'dirlist' not in dir(self):
|
||||
return
|
||||
@ -248,7 +279,14 @@ class EncodingUtil(ReallyEqualMixin):
|
||||
% (self.filesystem_encoding,))
|
||||
|
||||
def call_os_listdir(path):
|
||||
return self.dirlist
|
||||
if PY2:
|
||||
return self.dirlist
|
||||
else:
|
||||
# Python 3 always lists unicode filenames:
|
||||
return [d.decode(self.filesystem_encoding) if isinstance(d, bytes)
|
||||
else d
|
||||
for d in self.dirlist]
|
||||
|
||||
self.patch(os, 'listdir', call_os_listdir)
|
||||
|
||||
def call_sys_getfilesystemencoding():
|
||||
@ -258,7 +296,7 @@ class EncodingUtil(ReallyEqualMixin):
|
||||
_reload()
|
||||
filenames = listdir_unicode(u'/dummy')
|
||||
|
||||
self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
|
||||
self.failUnlessEqual(set([encodingutil.normalize(fname) for fname in filenames]),
|
||||
set(TEST_FILENAMES))
|
||||
|
||||
|
||||
@ -278,12 +316,16 @@ class StdlibUnicode(unittest.TestCase):
|
||||
fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
|
||||
open(fn, 'wb').close()
|
||||
self.failUnless(os.path.exists(fn))
|
||||
self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn)))
|
||||
if PY2:
|
||||
getcwdu = os.getcwdu
|
||||
else:
|
||||
getcwdu = os.getcwd
|
||||
self.failUnless(os.path.exists(os.path.join(getcwdu(), fn)))
|
||||
filenames = listdir_unicode(lumiere_nfc)
|
||||
|
||||
# We only require that the listing includes a filename that is canonically equivalent
|
||||
# to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
|
||||
self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames]))
|
||||
self.failUnlessIn(lumiere_nfc + u".txt", set([encodingutil.normalize(fname) for fname in filenames]))
|
||||
|
||||
expanded = fileutil.expanduser(u"~/" + lumiere_nfc)
|
||||
self.failIfIn(u"~", expanded)
|
||||
@ -314,59 +356,70 @@ class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
|
||||
self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
|
||||
if out[0:2] == 'b"':
|
||||
pass
|
||||
elif isinstance(inp, str):
|
||||
self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quote_newlines=quote_newlines), out)
|
||||
self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
|
||||
elif isinstance(inp, bytes):
|
||||
try:
|
||||
unicode_inp = inp.decode("utf-8")
|
||||
except UnicodeDecodeError:
|
||||
# Some things decode on Python 2, but not Python 3...
|
||||
return
|
||||
self.failUnlessReallyEqual(quote_output(unicode_inp, encoding=enc, quote_newlines=quote_newlines), out)
|
||||
self.failUnlessReallyEqual(quote_output(unicode_inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
|
||||
else:
|
||||
self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quote_newlines=quote_newlines), out)
|
||||
self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
|
||||
try:
|
||||
bytes_inp = inp.encode('utf-8')
|
||||
except UnicodeEncodeError:
|
||||
# Some things encode on Python 2, but not Python 3, e.g.
|
||||
# surrogates like u"\uDC00\uD800"...
|
||||
return
|
||||
self.failUnlessReallyEqual(quote_output(bytes_inp, encoding=enc, quote_newlines=quote_newlines), out)
|
||||
self.failUnlessReallyEqual(quote_output(bytes_inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
|
||||
|
||||
def _test_quote_output_all(self, enc):
|
||||
def check(inp, out, optional_quotes=False, quote_newlines=None):
|
||||
self._check(inp, out, enc, optional_quotes, quote_newlines)
|
||||
|
||||
# optional single quotes
|
||||
check("foo", "'foo'", True)
|
||||
check("\\", "'\\'", True)
|
||||
check("$\"`", "'$\"`'", True)
|
||||
check("\n", "'\n'", True, quote_newlines=False)
|
||||
check(b"foo", b"'foo'", True)
|
||||
check(b"\\", b"'\\'", True)
|
||||
check(b"$\"`", b"'$\"`'", True)
|
||||
check(b"\n", b"'\n'", True, quote_newlines=False)
|
||||
|
||||
# mandatory single quotes
|
||||
check("\"", "'\"'")
|
||||
check(b"\"", b"'\"'")
|
||||
|
||||
# double quotes
|
||||
check("'", "\"'\"")
|
||||
check("\n", "\"\\x0a\"", quote_newlines=True)
|
||||
check("\x00", "\"\\x00\"")
|
||||
check(b"'", b"\"'\"")
|
||||
check(b"\n", b"\"\\x0a\"", quote_newlines=True)
|
||||
check(b"\x00", b"\"\\x00\"")
|
||||
|
||||
# invalid Unicode and astral planes
|
||||
check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"")
|
||||
check(u"\uDC00\uD800", "\"\\udc00\\ud800\"")
|
||||
check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
|
||||
check(u"\uD800\uDC00", "\"\\U00010000\"")
|
||||
check(u"\uD800\uDC01", "\"\\U00010001\"")
|
||||
check(u"\uD801\uDC00", "\"\\U00010400\"")
|
||||
check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"")
|
||||
check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"")
|
||||
check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"")
|
||||
check(u"\uFDD0\uFDEF", b"\"\\ufdd0\\ufdef\"")
|
||||
check(u"\uDC00\uD800", b"\"\\udc00\\ud800\"")
|
||||
check(u"\uDC00\uD800\uDC00", b"\"\\udc00\\U00010000\"")
|
||||
check(u"\uD800\uDC00", b"\"\\U00010000\"")
|
||||
check(u"\uD800\uDC01", b"\"\\U00010001\"")
|
||||
check(u"\uD801\uDC00", b"\"\\U00010400\"")
|
||||
check(u"\uDBFF\uDFFF", b"\"\\U0010ffff\"")
|
||||
check(u"'\uDBFF\uDFFF", b"\"'\\U0010ffff\"")
|
||||
check(u"\"\uDBFF\uDFFF", b"\"\\\"\\U0010ffff\"")
|
||||
|
||||
# invalid UTF-8
|
||||
check("\xFF", "b\"\\xff\"")
|
||||
check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")
|
||||
check(b"\xFF", b"b\"\\xff\"")
|
||||
check(b"\x00\"$\\`\x80\xFF", b"b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")
|
||||
|
||||
def test_quote_output_ascii(self, enc='ascii'):
|
||||
def check(inp, out, optional_quotes=False, quote_newlines=None):
|
||||
self._check(inp, out, enc, optional_quotes, quote_newlines)
|
||||
|
||||
self._test_quote_output_all(enc)
|
||||
check(u"\u00D7", "\"\\xd7\"")
|
||||
check(u"'\u00D7", "\"'\\xd7\"")
|
||||
check(u"\"\u00D7", "\"\\\"\\xd7\"")
|
||||
check(u"\u2621", "\"\\u2621\"")
|
||||
check(u"'\u2621", "\"'\\u2621\"")
|
||||
check(u"\"\u2621", "\"\\\"\\u2621\"")
|
||||
check(u"\n", "'\n'", True, quote_newlines=False)
|
||||
check(u"\n", "\"\\x0a\"", quote_newlines=True)
|
||||
check(u"\u00D7", b"\"\\xd7\"")
|
||||
check(u"'\u00D7", b"\"'\\xd7\"")
|
||||
check(u"\"\u00D7", b"\"\\\"\\xd7\"")
|
||||
check(u"\u2621", b"\"\\u2621\"")
|
||||
check(u"'\u2621", b"\"'\\u2621\"")
|
||||
check(u"\"\u2621", b"\"\\\"\\u2621\"")
|
||||
check(u"\n", b"'\n'", True, quote_newlines=False)
|
||||
check(u"\n", b"\"\\x0a\"", quote_newlines=True)
|
||||
|
||||
def test_quote_output_latin1(self, enc='latin1'):
|
||||
def check(inp, out, optional_quotes=False, quote_newlines=None):
|
||||
@ -411,43 +464,43 @@ def win32_other(win32, other):
|
||||
|
||||
class QuotePaths(ReallyEqualMixin, unittest.TestCase):
|
||||
def test_quote_path(self):
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'bar']), "'foo/bar'")
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=True), "'foo/bar'")
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=False), "foo/bar")
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar']), '"foo/\\x0abar"')
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=True), '"foo/\\x0abar"')
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=False), '"foo/\\x0abar"')
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'bar']), b"'foo/bar'")
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=True), b"'foo/bar'")
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=False), b"foo/bar")
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar']), b'"foo/\\x0abar"')
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=True), b'"foo/\\x0abar"')
|
||||
self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=False), b'"foo/\\x0abar"')
|
||||
|
||||
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo"),
|
||||
win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'"))
|
||||
win32_other(b"'C:\\foo'", b"'\\\\?\\C:\\foo'"))
|
||||
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo", quotemarks=True),
|
||||
win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'"))
|
||||
win32_other(b"'C:\\foo'", b"'\\\\?\\C:\\foo'"))
|
||||
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo", quotemarks=False),
|
||||
win32_other("C:\\foo", "\\\\?\\C:\\foo"))
|
||||
win32_other(b"C:\\foo", b"\\\\?\\C:\\foo"))
|
||||
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar"),
|
||||
win32_other("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"))
|
||||
win32_other(b"'\\\\foo\\bar'", b"'\\\\?\\UNC\\foo\\bar'"))
|
||||
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar", quotemarks=True),
|
||||
win32_other("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"))
|
||||
win32_other(b"'\\\\foo\\bar'", b"'\\\\?\\UNC\\foo\\bar'"))
|
||||
self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar", quotemarks=False),
|
||||
win32_other("\\\\foo\\bar", "\\\\?\\UNC\\foo\\bar"))
|
||||
win32_other(b"\\\\foo\\bar", b"\\\\?\\UNC\\foo\\bar"))
|
||||
|
||||
def test_quote_filepath(self):
|
||||
foo_bar_fp = FilePath(win32_other(u'C:\\foo\\bar', u'/foo/bar'))
|
||||
self.failUnlessReallyEqual(quote_filepath(foo_bar_fp),
|
||||
win32_other("'C:\\foo\\bar'", "'/foo/bar'"))
|
||||
win32_other(b"'C:\\foo\\bar'", b"'/foo/bar'"))
|
||||
self.failUnlessReallyEqual(quote_filepath(foo_bar_fp, quotemarks=True),
|
||||
win32_other("'C:\\foo\\bar'", "'/foo/bar'"))
|
||||
win32_other(b"'C:\\foo\\bar'", b"'/foo/bar'"))
|
||||
self.failUnlessReallyEqual(quote_filepath(foo_bar_fp, quotemarks=False),
|
||||
win32_other("C:\\foo\\bar", "/foo/bar"))
|
||||
win32_other(b"C:\\foo\\bar", b"/foo/bar"))
|
||||
|
||||
if sys.platform == "win32":
|
||||
foo_longfp = FilePath(u'\\\\?\\C:\\foo')
|
||||
self.failUnlessReallyEqual(quote_filepath(foo_longfp),
|
||||
"'C:\\foo'")
|
||||
b"'C:\\foo'")
|
||||
self.failUnlessReallyEqual(quote_filepath(foo_longfp, quotemarks=True),
|
||||
"'C:\\foo'")
|
||||
b"'C:\\foo'")
|
||||
self.failUnlessReallyEqual(quote_filepath(foo_longfp, quotemarks=False),
|
||||
"C:\\foo")
|
||||
b"C:\\foo")
|
||||
|
||||
|
||||
class FilePaths(ReallyEqualMixin, unittest.TestCase):
|
||||
@ -501,23 +554,23 @@ class FilePaths(ReallyEqualMixin, unittest.TestCase):
|
||||
|
||||
class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
|
||||
uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
|
||||
argv = 'lumi\xc3\xa8re'
|
||||
argv = b'lumi\xc3\xa8re'
|
||||
platform = 'linux2'
|
||||
filesystem_encoding = 'UTF-8'
|
||||
io_encoding = 'UTF-8'
|
||||
dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
|
||||
dirlist = [b'test_file', b'\xc3\x84rtonwall.mp3', b'Blah blah.txt']
|
||||
|
||||
class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
|
||||
uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
|
||||
argv = 'lumi\xe8re'
|
||||
argv = b'lumi\xe8re'
|
||||
platform = 'linux2'
|
||||
filesystem_encoding = 'ISO-8859-1'
|
||||
io_encoding = 'ISO-8859-1'
|
||||
dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
|
||||
dirlist = [b'test_file', b'Blah blah.txt', b'\xc4rtonwall.mp3']
|
||||
|
||||
class Windows(EncodingUtil, unittest.TestCase):
|
||||
uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
|
||||
argv = 'lumi\xc3\xa8re'
|
||||
argv = b'lumi\xc3\xa8re'
|
||||
platform = 'win32'
|
||||
filesystem_encoding = 'mbcs'
|
||||
io_encoding = 'utf-8'
|
||||
@ -525,7 +578,7 @@ class Windows(EncodingUtil, unittest.TestCase):
|
||||
|
||||
class MacOSXLeopard(EncodingUtil, unittest.TestCase):
|
||||
uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
|
||||
output = 'lumi\xc3\xa8re'
|
||||
output = b'lumi\xc3\xa8re'
|
||||
platform = 'darwin'
|
||||
filesystem_encoding = 'utf-8'
|
||||
io_encoding = 'UTF-8'
|
||||
@ -548,14 +601,14 @@ class OpenBSD(EncodingUtil, unittest.TestCase):
|
||||
|
||||
class TestToFromStr(ReallyEqualMixin, unittest.TestCase):
|
||||
def test_to_str(self):
|
||||
self.failUnlessReallyEqual(to_str("foo"), "foo")
|
||||
self.failUnlessReallyEqual(to_str("lumi\xc3\xa8re"), "lumi\xc3\xa8re")
|
||||
self.failUnlessReallyEqual(to_str("\xFF"), "\xFF") # passes through invalid UTF-8 -- is this what we want?
|
||||
self.failUnlessReallyEqual(to_str(u"lumi\u00E8re"), "lumi\xc3\xa8re")
|
||||
self.failUnlessReallyEqual(to_str(b"foo"), b"foo")
|
||||
self.failUnlessReallyEqual(to_str(b"lumi\xc3\xa8re"), b"lumi\xc3\xa8re")
|
||||
self.failUnlessReallyEqual(to_str(b"\xFF"), b"\xFF") # passes through invalid UTF-8 -- is this what we want?
|
||||
self.failUnlessReallyEqual(to_str(u"lumi\u00E8re"), b"lumi\xc3\xa8re")
|
||||
self.failUnlessReallyEqual(to_str(None), None)
|
||||
|
||||
def test_from_utf8_or_none(self):
|
||||
self.failUnlessRaises(AssertionError, from_utf8_or_none, u"foo")
|
||||
self.failUnlessReallyEqual(from_utf8_or_none("lumi\xc3\xa8re"), u"lumi\u00E8re")
|
||||
self.failUnlessReallyEqual(from_utf8_or_none(b"lumi\xc3\xa8re"), u"lumi\u00E8re")
|
||||
self.failUnlessReallyEqual(from_utf8_or_none(None), None)
|
||||
self.failUnlessRaises(UnicodeDecodeError, from_utf8_or_none, "\xFF")
|
||||
self.failUnlessRaises(UnicodeDecodeError, from_utf8_or_none, b"\xFF")
|
||||
|
@ -41,6 +41,7 @@ PORTED_MODULES = [
|
||||
"allmydata.util.deferredutil",
|
||||
"allmydata.util.fileutil",
|
||||
"allmydata.util.dictutil",
|
||||
"allmydata.util.encodingutil",
|
||||
"allmydata.util.gcutil",
|
||||
"allmydata.util.hashutil",
|
||||
"allmydata.util.humanreadable",
|
||||
@ -64,6 +65,7 @@ PORTED_TEST_MODULES = [
|
||||
"allmydata.test.test_crypto",
|
||||
"allmydata.test.test_deferredutil",
|
||||
"allmydata.test.test_dictutil",
|
||||
"allmydata.test.test_encodingutil",
|
||||
"allmydata.test.test_happiness",
|
||||
"allmydata.test.test_hashtree",
|
||||
"allmydata.test.test_hashutil",
|
||||
|
@ -1,9 +1,26 @@
|
||||
"""
|
||||
Functions used to convert inputs from whatever encoding used in the system to
|
||||
unicode and back.
|
||||
|
||||
Ported to Python 3.
|
||||
|
||||
Once Python 2 support is dropped, most of this module will obsolete, since
|
||||
Unicode is the default everywhere in Python 3.
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from future.utils import PY2, PY3, native_str
|
||||
if PY2:
|
||||
# We omit str() because that seems too tricky to get right.
|
||||
from builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, max, min # noqa: F401
|
||||
|
||||
from past.builtins import unicode
|
||||
|
||||
import sys, os, re, locale
|
||||
import unicodedata
|
||||
|
||||
from allmydata.util.assertutil import precondition, _assert
|
||||
from twisted.python import usage
|
||||
@ -62,13 +79,14 @@ def _reload():
|
||||
|
||||
check_encoding(io_encoding)
|
||||
|
||||
is_unicode_platform = sys.platform in ["win32", "darwin"]
|
||||
is_unicode_platform = PY3 or sys.platform in ["win32", "darwin"]
|
||||
|
||||
# Despite the Unicode-mode FilePath support added to Twisted in
|
||||
# <https://twistedmatrix.com/trac/ticket/7805>, we can't yet use
|
||||
# Unicode-mode FilePaths with INotify on non-Windows platforms
|
||||
# due to <https://twistedmatrix.com/trac/ticket/7928>.
|
||||
use_unicode_filepath = sys.platform == "win32"
|
||||
# due to <https://twistedmatrix.com/trac/ticket/7928>. Supposedly
|
||||
# 7928 is fixed, though...
|
||||
use_unicode_filepath = PY3 or sys.platform == "win32"
|
||||
|
||||
_reload()
|
||||
|
||||
@ -89,7 +107,10 @@ def argv_to_unicode(s):
|
||||
"""
|
||||
Decode given argv element to unicode. If this fails, raise a UsageError.
|
||||
"""
|
||||
precondition(isinstance(s, str), s)
|
||||
if isinstance(s, unicode):
|
||||
return s
|
||||
|
||||
precondition(isinstance(s, bytes), s)
|
||||
|
||||
try:
|
||||
return unicode(s, io_encoding)
|
||||
@ -114,18 +135,22 @@ def unicode_to_argv(s, mangle=False):
|
||||
If the argument is to be passed to a different process, then the 'mangle' argument
|
||||
should be true; on Windows, this uses a mangled encoding that will be reversed by
|
||||
code in runner.py.
|
||||
|
||||
On Python 3, just return the string unchanged, since argv is unicode.
|
||||
"""
|
||||
precondition(isinstance(s, unicode), s)
|
||||
if PY3:
|
||||
return s
|
||||
|
||||
if mangle and sys.platform == "win32":
|
||||
# This must be the same as 'mangle' in bin/tahoe-script.template.
|
||||
return str(re.sub(u'[^\\x20-\\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
|
||||
return bytes(re.sub(u'[^\\x20-\\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s), io_encoding)
|
||||
else:
|
||||
return s.encode(io_encoding)
|
||||
|
||||
def unicode_to_url(s):
|
||||
"""
|
||||
Encode an unicode object used in an URL.
|
||||
Encode an unicode object used in an URL to bytes.
|
||||
"""
|
||||
# According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
|
||||
|
||||
@ -134,19 +159,19 @@ def unicode_to_url(s):
|
||||
#precondition(isinstance(s, unicode), s)
|
||||
#return s.encode('utf-8')
|
||||
|
||||
def to_str(s):
|
||||
if s is None or isinstance(s, str):
|
||||
def to_str(s): # TODO rename to to_bytes
|
||||
if s is None or isinstance(s, bytes):
|
||||
return s
|
||||
return s.encode('utf-8')
|
||||
|
||||
def from_utf8_or_none(s):
|
||||
precondition(isinstance(s, str) or s is None, s)
|
||||
precondition(isinstance(s, bytes) or s is None, s)
|
||||
if s is None:
|
||||
return s
|
||||
return s.decode('utf-8')
|
||||
|
||||
PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL)
|
||||
PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
|
||||
PRINTABLE_ASCII = re.compile(br'^[\n\r\x20-\x7E]*$', re.DOTALL)
|
||||
PRINTABLE_8BIT = re.compile(br'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
|
||||
|
||||
def is_printable_ascii(s):
|
||||
return PRINTABLE_ASCII.search(s) is not None
|
||||
@ -160,14 +185,14 @@ def unicode_to_output(s):
|
||||
try:
|
||||
out = s.encode(io_encoding)
|
||||
except (UnicodeEncodeError, UnicodeDecodeError):
|
||||
raise UnicodeEncodeError(io_encoding, s, 0, 0,
|
||||
"A string could not be encoded as %s for output to the terminal:\n%r" %
|
||||
(io_encoding, repr(s)))
|
||||
raise UnicodeEncodeError(native_str(io_encoding), s, 0, 0,
|
||||
native_str("A string could not be encoded as %s for output to the terminal:\n%r" %
|
||||
(io_encoding, repr(s))))
|
||||
|
||||
if PRINTABLE_8BIT.search(out) is None:
|
||||
raise UnicodeEncodeError(io_encoding, s, 0, 0,
|
||||
"A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
|
||||
(io_encoding, repr(s)))
|
||||
raise UnicodeEncodeError(native_str(io_encoding), s, 0, 0,
|
||||
native_str("A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
|
||||
(io_encoding, repr(s))))
|
||||
return out
|
||||
|
||||
|
||||
@ -188,14 +213,17 @@ def _unicode_escape(m, quote_newlines):
|
||||
else:
|
||||
return u'\\x%02x' % (codepoint,)
|
||||
|
||||
def _str_escape(m, quote_newlines):
|
||||
def _str_escape(m, quote_newlines): # TODO rename to _bytes_escape
|
||||
"""
|
||||
Takes a re match on bytes, the result is escaped bytes of group(0).
|
||||
"""
|
||||
c = m.group(0)
|
||||
if c == '"' or c == '$' or c == '`' or c == '\\':
|
||||
return '\\' + c
|
||||
elif c == '\n' and not quote_newlines:
|
||||
if c == b'"' or c == b'$' or c == b'`' or c == b'\\':
|
||||
return b'\\' + c
|
||||
elif c == b'\n' and not quote_newlines:
|
||||
return c
|
||||
else:
|
||||
return '\\x%02x' % (ord(c),)
|
||||
return b'\\x%02x' % (ord(c),)
|
||||
|
||||
MUST_DOUBLE_QUOTE_NL = re.compile(u'[^\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
|
||||
MUST_DOUBLE_QUOTE = re.compile(u'[^\\n\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
|
||||
@ -205,7 +233,7 @@ ESCAPABLE_UNICODE = re.compile(u'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid sur
|
||||
u'[^ !#\\x25-\\x5B\\x5D-\\x5F\\x61-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
|
||||
re.DOTALL)
|
||||
|
||||
ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
|
||||
ESCAPABLE_8BIT = re.compile( br'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
|
||||
|
||||
def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
|
||||
"""
|
||||
@ -221,32 +249,32 @@ def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
|
||||
|
||||
If not explicitly given, quote_newlines is True when quotemarks is True.
|
||||
"""
|
||||
precondition(isinstance(s, (str, unicode)), s)
|
||||
precondition(isinstance(s, (bytes, unicode)), s)
|
||||
if quote_newlines is None:
|
||||
quote_newlines = quotemarks
|
||||
|
||||
if isinstance(s, str):
|
||||
if isinstance(s, bytes):
|
||||
try:
|
||||
s = s.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
return 'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),)
|
||||
return b'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),)
|
||||
|
||||
must_double_quote = quote_newlines and MUST_DOUBLE_QUOTE_NL or MUST_DOUBLE_QUOTE
|
||||
if must_double_quote.search(s) is None:
|
||||
try:
|
||||
out = s.encode(encoding or io_encoding)
|
||||
if quotemarks or out.startswith('"'):
|
||||
return "'%s'" % (out,)
|
||||
if quotemarks or out.startswith(b'"'):
|
||||
return b"'%s'" % (out,)
|
||||
else:
|
||||
return out
|
||||
except (UnicodeDecodeError, UnicodeEncodeError):
|
||||
pass
|
||||
|
||||
escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s)
|
||||
return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
|
||||
return b'"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
|
||||
|
||||
def quote_path(path, quotemarks=True):
|
||||
return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True)
|
||||
return quote_output(b"/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True)
|
||||
|
||||
def quote_local_unicode_path(path, quotemarks=True):
|
||||
precondition(isinstance(path, unicode), path)
|
||||
@ -275,7 +303,7 @@ def extend_filepath(fp, segments):
|
||||
return fp
|
||||
|
||||
def to_filepath(path):
|
||||
precondition(isinstance(path, unicode if use_unicode_filepath else basestring),
|
||||
precondition(isinstance(path, unicode if use_unicode_filepath else (bytes, unicode)),
|
||||
path=path)
|
||||
|
||||
if isinstance(path, unicode) and not use_unicode_filepath:
|
||||
@ -290,7 +318,7 @@ def to_filepath(path):
|
||||
return FilePath(path)
|
||||
|
||||
def _decode(s):
|
||||
precondition(isinstance(s, basestring), s=s)
|
||||
precondition(isinstance(s, (bytes, unicode)), s=s)
|
||||
|
||||
if isinstance(s, bytes):
|
||||
return s.decode(filesystem_encoding)
|
||||
@ -356,3 +384,9 @@ def listdir_unicode(path):
|
||||
|
||||
def listdir_filepath(fp):
|
||||
return listdir_unicode(unicode_from_filepath(fp))
|
||||
|
||||
|
||||
# 'x' at the end of a variable name indicates that it holds a Unicode string that may not
|
||||
# be NFC-normalized.
|
||||
def normalize(namex):
|
||||
return unicodedata.normalize('NFC', namex)
|
||||
|
Loading…
x
Reference in New Issue
Block a user