2010-05-20 00:41:05 +00:00
|
|
|
"""
|
|
|
|
Functions used to convert inputs from whatever encoding is used by the system to
|
|
|
|
unicode and back.
|
|
|
|
"""
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import os
|
2010-06-07 01:02:15 +00:00
|
|
|
import re
|
2010-05-20 00:41:05 +00:00
|
|
|
import unicodedata
|
|
|
|
from allmydata.util.assertutil import precondition
|
|
|
|
from twisted.python import usage
|
2010-06-04 14:12:51 +00:00
|
|
|
import locale
|
2010-05-20 00:41:05 +00:00
|
|
|
|
2010-06-07 01:02:15 +00:00
|
|
|
|
|
|
|
def _canonical_encoding(encoding):
    """
    Normalize an encoding name reported by the platform.

    None is treated as 'utf-8'. The name is lowercased, then 'cp65001' is
    mapped to 'utf-8' and 'us-ascii' / '646' are mapped to 'ascii'.

    Raises AssertionError if Python cannot encode text with the resulting
    name.
    """
    name = 'utf-8' if encoding is None else encoding
    name = name.lower()

    if name == "cp65001":
        name = 'utf-8'
    elif name in ("us-ascii", "646"):
        name = 'ascii'

    # sometimes Python returns an encoding name that it doesn't support for
    # conversion; fail early if this happens
    try:
        u"test".encode(name)
    except (LookupError, AttributeError):
        raise AssertionError("The character encoding '%s' is not supported for conversion." % (name,))

    return name
|
|
|
|
|
|
|
|
# Module-level encoding settings. These are placeholders only; _reload()
# assigns the real values.
filesystem_encoding = output_encoding = argv_encoding = None
is_unicode_platform = False
|
|
|
|
|
|
|
|
def _reload():
    """
    Recompute the module-level encoding settings from the running
    interpreter and platform.

    Sets the globals filesystem_encoding, output_encoding, argv_encoding and
    is_unicode_platform. Raises AssertionError (from _canonical_encoding) if
    a detected encoding is not supported for conversion.
    """
    global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform

    filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())

    # sys.stdout may have been replaced by an object that has no 'encoding'
    # attribute (or may be None); treat that like an unset encoding and fall
    # back to the locale's preferred encoding instead of raising
    # AttributeError.
    stdout_encoding = getattr(sys.stdout, 'encoding', None)
    output_encoding = _canonical_encoding(stdout_encoding or locale.getpreferredencoding())

    if sys.platform == 'win32':
        # Unicode arguments are not supported on Windows yet; see #565 and #1074.
        argv_encoding = 'ascii'
    else:
        argv_encoding = output_encoding
    is_unicode_platform = sys.platform in ["win32", "darwin"]


# Initialize the settings once at import time.
_reload()
|
|
|
|
|
|
|
|
|
|
|
|
def get_filesystem_encoding():
    """
    Return the encoding expected for local filenames (the current value of
    the module-level 'filesystem_encoding' setting).
    """
    return filesystem_encoding
|
2010-05-20 00:41:05 +00:00
|
|
|
|
2010-06-07 01:02:15 +00:00
|
|
|
def get_output_encoding():
    """
    Return the encoding expected when writing to stdout or stderr (the
    current value of the module-level 'output_encoding' setting).
    """
    return output_encoding
|
2010-05-20 00:41:05 +00:00
|
|
|
|
2010-06-07 01:02:15 +00:00
|
|
|
def get_argv_encoding():
    """
    Return the encoding expected for command-line arguments (the current
    value of the module-level 'argv_encoding' setting).
    """
    return argv_encoding
|
2010-05-20 00:41:05 +00:00
|
|
|
|
2010-06-07 01:02:15 +00:00
|
|
|
def argv_to_unicode(s):
    """
    Decode one argv element (a byte string) to unicode using argv_encoding.

    Raises usage.UsageError if the bytes are not valid in that encoding.
    """
    precondition(isinstance(s, str), s)

    try:
        decoded = unicode(s, argv_encoding)
    except UnicodeDecodeError:
        message = "Argument %s cannot be decoded as %s." % (quote_output(s), argv_encoding)
        raise usage.UsageError(message)
    return decoded
|
2010-05-20 00:41:05 +00:00
|
|
|
|
|
|
|
def unicode_to_url(s):
    """
    Encode a unicode object for use in a URL.
    """
    # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.

    # FIXME: the strict form of this function (require a unicode argument via
    # precondition(), then return s.encode('utf-8')) is currently disabled;
    # the conversion is delegated to to_str() instead.
    url_bytes = to_str(s)
    return url_bytes
|
|
|
|
|
|
|
|
def to_str(s):
    """
    Return s unchanged if it is None or already a str; otherwise return
    s.encode('utf-8').
    """
    if s is not None and not isinstance(s, str):
        return s.encode('utf-8')
    return s
|
|
|
|
|
2010-06-07 01:02:15 +00:00
|
|
|
def to_argv(s):
    """
    Return s unchanged if it is already a str; otherwise return it encoded
    with argv_encoding.
    """
    if not isinstance(s, str):
        return s.encode(argv_encoding)
    return s
|
|
|
|
|
|
|
|
# Matches strings made up only of space through '~', newline and carriage
# return.
PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL)

# Like PRINTABLE_ASCII, but also admits \x80-\xFF and leaves out the single
# quote character (the ranges ' -&' and '(-~' skip it).
PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL)


def is_printable_ascii(s):
    """
    Return True if every character of s is accepted by PRINTABLE_ASCII.
    """
    # The pattern is anchored with '^', so match() behaves exactly like
    # search() here.
    found = PRINTABLE_ASCII.match(s)
    return found is not None
|
|
|
|
|
|
|
|
def unicode_to_output(s):
    """
    Encode a unicode object with output_encoding for display on stdout or
    stderr.

    Raises UnicodeEncodeError if the string cannot be encoded, or if the
    encoded bytes are not accepted by PRINTABLE_8BIT.
    """
    precondition(isinstance(s, unicode), s)

    try:
        encoded = s.encode(output_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        reason = ("A string could not be encoded as %s for output to the terminal:\n%r" %
                  (output_encoding, repr(s)))
        raise UnicodeEncodeError(output_encoding, s, 0, 0, reason)

    if PRINTABLE_8BIT.search(encoded) is not None:
        return encoded

    reason = ("A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
              (output_encoding, repr(s)))
    raise UnicodeEncodeError(output_encoding, s, 0, 0, reason)
|
|
|
|
|
|
|
|
def quote_output(s, quotemarks=True):
    """
    Encode either a Unicode string or a UTF-8-encoded bytestring for
    representation on stdout or stderr, tolerating errors.

    A bytestring that is not valid UTF-8 is returned as 'b' + repr(s). A
    string that cannot be encoded with output_encoding, or whose encoded
    form is rejected by PRINTABLE_8BIT, is returned as a repr(). Otherwise,
    if 'quotemarks' is True the encoded string is surrounded by single
    quotes (with backslashes and single quotes backslash-escaped); if it is
    False the encoded string is returned as-is.
    """
    precondition(isinstance(s, (str, unicode)), s)

    if isinstance(s, str):
        try:
            s = s.decode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8: show the raw bytes, marked with a 'b' prefix.
            return 'b' + repr(s)

    try:
        out = s.encode(output_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        return repr(s)

    if PRINTABLE_8BIT.search(out) is None:
        return repr(out)

    if quotemarks:
        # Escape backslashes first, then single quotes. The replacement for
        # a quote must be spelled "\\'" -- the literal "\'" is just "'" and
        # would make the replace a no-op. (PRINTABLE_8BIT currently rejects
        # single quotes above, so this is defensive.)
        return "'" + out.replace("\\", "\\\\").replace("'", "\\'") + "'"
    else:
        return out
|
|
|
|
|
|
|
|
def quote_path(path, quotemarks=True):
    """
    Join the components of 'path' with '/' (each converted with to_str())
    and quote the result with quote_output().
    """
    joined = "/".join(map(to_str, path))
    return quote_output(joined, quotemarks=quotemarks)
|
|
|
|
|
|
|
|
|
|
|
|
def unicode_platform():
    """
    Report whether the current platform handles Unicode filenames natively
    (the current value of the module-level 'is_unicode_platform' flag).
    """
    return is_unicode_platform
|
2010-05-20 00:41:05 +00:00
|
|
|
|
|
|
|
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
    # NOTE(review): the docstring contains a '%s' placeholder, presumably so
    # it can be used as a message template -- confirm against callers before
    # rewording it.
|
|
|
|
|
2010-06-07 01:02:15 +00:00
|
|
|
def listdir_unicode_fallback(path):
    """
    This function emulates a fallback Unicode API similar to one available
    under Windows or MacOS X.

    The unicode 'path' is encoded with filesystem_encoding, listed with the
    byte-level os.listdir(), and each returned name is decoded back to
    unicode.

    If badly encoded filenames are encountered, an exception is raised:
    FilenameEncodingError carrying 'path' if the path cannot be encoded, or
    carrying the offending byte-string name if an entry cannot be decoded.
    """
    precondition(isinstance(path, unicode), path)

    try:
        byte_path = path.encode(filesystem_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise FilenameEncodingError(path)

    # Decode entry by entry in an explicit loop so the name reported in the
    # error is well-defined, rather than relying on a list-comprehension
    # variable leaking into the enclosing scope.
    names = []
    for fn in os.listdir(byte_path):
        try:
            names.append(unicode(fn, filesystem_encoding))
        except UnicodeDecodeError:
            raise FilenameEncodingError(fn)
    return names
|
|
|
|
|
2010-06-07 01:02:15 +00:00
|
|
|
def listdir_unicode(path):
    """
    Wrapper around listdir() which provides safe access to the convenient
    Unicode API even under platforms that don't provide one natively.

    Returns the directory entries of the unicode 'path', each normalized to
    NFC.
    """
    precondition(isinstance(path, unicode), path)

    # On Windows and MacOS X, the Unicode API is used.
    # On other platforms (i.e. Unix systems), the byte-level API is used.
    lister = os.listdir if is_unicode_platform else listdir_unicode_fallback
    entries = lister(path)

    # Normalize the resulting unicode filenames.
    #
    # This prevents different OSes from generating non-equal unicode strings
    # for the same filename representation.
    return [unicodedata.normalize('NFC', entry) for entry in entries]
|
|
|
|
|
2010-06-07 01:02:15 +00:00
|
|
|
def open_unicode(path, mode):
    """
    Wrapper around open() which provides safe access to the convenient
    Unicode API even under Unix.

    'path' must be unicode; '~' is expanded with os.path.expanduser(). On
    platforms without a native Unicode API the path is first encoded with
    filesystem_encoding.

    Raises FilenameEncodingError if the path cannot be encoded.
    """
    precondition(isinstance(path, unicode), path)

    if is_unicode_platform:
        return open(os.path.expanduser(path), mode)

    # Only the encoding step is guarded, and both Unicode error types are
    # caught, matching listdir_unicode_fallback() and
    # abspath_expanduser_unicode().
    try:
        byte_path = path.encode(filesystem_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise FilenameEncodingError(path)
    return open(os.path.expanduser(byte_path), mode)
|
2010-06-07 01:02:15 +00:00
|
|
|
|
|
|
|
def abspath_expanduser_unicode(path):
    """
    Return the absolute form of the unicode 'path' with '~' expanded, as
    computed by os.path.expanduser() followed by os.path.abspath().

    On platforms without a native Unicode API the path is encoded with
    filesystem_encoding for the computation and the result is decoded back.
    Raises FilenameEncodingError if that encoding or decoding fails.
    """
    precondition(isinstance(path, unicode), path)

    if not is_unicode_platform:
        try:
            encoded = path.encode(filesystem_encoding)
            absolute = os.path.abspath(os.path.expanduser(encoded))
            return absolute.decode(filesystem_encoding)
        except (UnicodeEncodeError, UnicodeDecodeError):
            raise FilenameEncodingError(path)

    return os.path.abspath(os.path.expanduser(path))
|