tahoe-lafs/src/allmydata/util/encodingutil.py

"""
Functions used to convert inputs from whatever encoding is used by the system
to unicode and back.
"""

import sys
import re
from allmydata.util.assertutil import precondition
from twisted.python import usage
import locale
from allmydata.util import log


def _canonical_encoding(encoding):
    """
    Normalise an encoding name reported by the platform.

    None falls back to UTF-8 (a warning is logged). The name is lower-cased
    and a few known aliases are mapped to 'utf-8' or 'ascii'. The resulting
    name is returned.

    Raises AssertionError if the resulting name cannot be used to encode a
    unicode string.
    """
    if encoding is None:
        log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
        encoding = 'utf-8'

    name = encoding.lower()
    if name == "cp65001":
        # cp65001 is handled as UTF-8.
        name = 'utf-8'
    elif name in ("us-ascii", "646", "ansi_x3.4-1968"):
        name = 'ascii'

    # Python sometimes reports an encoding name that it cannot actually use
    # for conversion, so probe the codec now and fail early.
    try:
        u"test".encode(name)
    except (LookupError, AttributeError):
        raise AssertionError("The character encoding '%s' is not supported for conversion." % (name,))

    return name

# Module-level encoding state. These are placeholders only: _reload() assigns
# the real values when it is called below.
is_unicode_platform = False   # True when sys.platform is "win32" or "darwin"
argv_encoding = None          # encoding for command-line arguments
output_encoding = None        # encoding for writing to stdout/stderr
filesystem_encoding = None    # encoding for local filenames

def _reload():
    """
    (Re)compute the module-level encoding settings.

    Assigns filesystem_encoding, output_encoding, argv_encoding and
    is_unicode_platform from sys.getfilesystemencoding(), sys.stdout, the
    locale's preferred encoding and sys.platform.
    """
    global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform

    filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())

    # sys.stdout may be an object that has no 'encoding' attribute at all.
    if hasattr(sys.stdout, 'encoding'):
        stdout_encoding = sys.stdout.encoding
    else:
        stdout_encoding = None

    if stdout_encoding is None:
        try:
            stdout_encoding = locale.getpreferredencoding()
        except Exception:
            # Work around <http://bugs.python.org/issue1443504>. The value
            # stays None, which _canonical_encoding turns into UTF-8.
            pass
    output_encoding = _canonical_encoding(stdout_encoding)

    on_windows = (sys.platform == 'win32')
    if not on_windows:
        argv_encoding = output_encoding
    else:
        # Unicode arguments are not supported on Windows yet; see #565 and #1074.
        argv_encoding = 'ascii'
    is_unicode_platform = sys.platform in ("win32", "darwin")

# Populate the module-level settings at import time.
_reload()


def get_filesystem_encoding():
    """
    Returns expected encoding for local filenames.

    This is the module-level filesystem_encoding value, which _reload()
    derives from sys.getfilesystemencoding().
    """
    return filesystem_encoding

def get_output_encoding():
    """
    Returns expected encoding for writing to stdout or stderr.

    This is the module-level output_encoding value, which _reload() derives
    from sys.stdout.encoding, falling back to locale.getpreferredencoding()
    when that is missing or None.
    """
    return output_encoding

def get_argv_encoding():
    """
    Returns expected encoding for command-line arguments.

    This is the module-level argv_encoding value, which _reload() sets to
    'ascii' when sys.platform is 'win32' and to the output encoding otherwise.
    """
    return argv_encoding

def argv_to_unicode(s):
    """
    Decode one command-line argument, given as a byte string, to unicode
    using the argv encoding.

    Raises usage.UsageError if the bytes are not valid in that encoding.
    """
    precondition(isinstance(s, str), s)

    try:
        decoded = unicode(s, argv_encoding)
    except UnicodeDecodeError:
        message = "Argument %s cannot be decoded as %s." % (quote_output(s), argv_encoding)
        raise usage.UsageError(message)
    return decoded

def unicode_to_url(s):
    """
    Encode a unicode object for use in a URL.

    At present this simply delegates to to_str(): None and byte strings are
    returned unchanged, and anything else is encoded as UTF-8.
    """
    # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.

    # FIXME: the stricter implementation below, which insists on a unicode
    # argument, is disabled -- presumably because some callers still pass
    # byte strings (TODO confirm before enabling it).
    return to_str(s)
    #precondition(isinstance(s, unicode), s)
    #return s.encode('utf-8')

def to_str(s):
    """
    Return 's' as a UTF-8 byte string.

    None and objects that are already 'str' are passed through untouched;
    anything else is encoded with its encode('utf-8') method.
    """
    if s is not None and not isinstance(s, str):
        return s.encode('utf-8')
    return s

def to_argv(s):
    """
    Return 's' in the form expected for a command-line argument.

    A 'str' is returned as-is; anything else is encoded with the argv
    encoding.
    """
    if not isinstance(s, str):
        return s.encode(argv_encoding)
    return s

# Matches strings consisting only of the characters from space through '~',
# plus newline and carriage return.
PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL)

# Like PRINTABLE_ASCII, but additionally allows \x80-\xFF and leaves out the
# single quote (the character class skips from '&' straight to '(').
PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL)

def is_printable_ascii(s):
    """
    Return True if every character of 's' is accepted by PRINTABLE_ASCII,
    and False otherwise. The empty string counts as printable.
    """
    # The pattern is anchored with '^', so matching from the start of the
    # string is equivalent to searching.
    return bool(PRINTABLE_ASCII.match(s))

def unicode_to_output(s):
    """
    Encode an unicode object for representation on stdout or stderr.

    Returns the byte string obtained by encoding 's' with the module's
    output encoding.

    Raises UnicodeEncodeError if 's' cannot be encoded in the output
    encoding, or if the encoded result contains bytes that PRINTABLE_8BIT
    does not accept.
    """
    precondition(isinstance(s, unicode), s)

    try:
        out = s.encode(output_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        # UnicodeDecodeError is caught as well so that either failure is
        # reported to the caller as a UnicodeEncodeError.
        # repr(s) is interpolated with %s: it is already an escaped rendering
        # of the string, and %r would escape it a second time.
        raise UnicodeEncodeError(output_encoding, s, 0, 0,
                                 "A string could not be encoded as %s for output to the terminal:\n%s" %
                                 (output_encoding, repr(s)))

    if PRINTABLE_8BIT.search(out) is None:
        raise UnicodeEncodeError(output_encoding, s, 0, 0,
                                 "A string encoded as %s for output to the terminal contained unsafe bytes:\n%s" %
                                 (output_encoding, repr(s)))
    return out

def quote_output(s, quotemarks=True, encoding=None):
    """
    Encode either a Unicode string or a UTF-8-encoded bytestring for representation
    on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
    always surrounded by single quotes; otherwise, it is quoted only if necessary to
    avoid ambiguity or control bytes in the output.

    'encoding', when given, is used instead of the module's output encoding.

    Fallbacks, in order:
      - a byte string that is not valid UTF-8 is returned as 'b' + repr(s);
      - a string that cannot be encoded is returned as repr(s);
      - an encoded string containing bytes that PRINTABLE_8BIT does not
        accept is returned as repr(out).
    """
    precondition(isinstance(s, (str, unicode)), s)

    if isinstance(s, str):
        try:
            s = s.decode('utf-8')
        except UnicodeDecodeError:
            return 'b' + repr(s)

    try:
        out = s.encode(encoding or output_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        return repr(s)

    if PRINTABLE_8BIT.search(out) is None:
        return repr(out)

    if quotemarks:
        # Backslashes are escaped first so that the backslash added in front
        # of a quote is not itself doubled. The quote replacement must be
        # written "\\'": the literal "\'" is just a single quote, which made
        # the replacement a no-op. PRINTABLE_8BIT currently rejects "'" above,
        # so this only matters if that pattern is ever relaxed.
        return "'" + out.replace("\\", "\\\\").replace("'", "\\'") + "'"
    else:
        return out

def quote_path(path, quotemarks=True):
    """
    Quote a path, given as a sequence of segments, for output.

    Each segment is passed through to_str(), the results are joined with
    "/", and the joined string is handed to quote_output().
    """
    segments = [to_str(segment) for segment in path]
    return quote_output("/".join(segments), quotemarks=quotemarks)


def unicode_platform():
    """
    Does the current platform handle Unicode filenames natively?

    Returns the module-level is_unicode_platform flag, which _reload() sets
    to True when sys.platform is "win32" or "darwin".
    """
    return is_unicode_platform
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`"""`
			`Functions used to convert inputs from whatever encoding used in the system to`
			`unicode and back.`
			`"""`

			`import sys`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`import re`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`from allmydata.util.assertutil import precondition`
			`from twisted.python import usage`
unicode: make test_cli test a non-ascii argument, and make the fallback term encoding be locale.getpreferredencoding() 2010-06-04 14:12:51 +00:00			`import locale`
stringutils.py: Add encoding argument to quote_output. Also work around a bug in locale.getpreferredencoding on older Pythons. 2010-06-16 04:20:12 +00:00			`from allmydata.util import log`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00
Unicode fixes. 2010-06-07 01:02:15 +00:00
			`def _canonical_encoding(encoding):`
			`if encoding is None:`
stringutils.py: Add encoding argument to quote_output. Also work around a bug in locale.getpreferredencoding on older Pythons. 2010-06-16 04:20:12 +00:00			`log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`encoding = 'utf-8'`
			`encoding = encoding.lower()`
			`if encoding == "cp65001":`
			`encoding = 'utf-8'`
stringutils.py: Add encoding argument to quote_output. Also work around a bug in locale.getpreferredencoding on older Pythons. 2010-06-16 04:20:12 +00:00			`elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`encoding = 'ascii'`

			`# sometimes Python returns an encoding name that it doesn't support for conversion`
			`# fail early if this happens`
			`try:`
			`u"test".encode(encoding)`
stringutils.py, sftpd.py: Portability fixes for Python <= 2.5. 2010-06-09 01:33:02 +00:00			`except (LookupError, AttributeError):`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))`

			`return encoding`

			`filesystem_encoding = None`
			`output_encoding = None`
			`argv_encoding = None`
			`is_unicode_platform = False`

			`def _reload():`
			`global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform`

			`filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())`
stringutils.py: Add encoding argument to quote_output. Also work around a bug in locale.getpreferredencoding on older Pythons. 2010-06-16 04:20:12 +00:00
stringutils.py: tolerate sys.stdout having no 'encoding' attribute. 2010-06-26 04:08:17 +00:00			`outenc = None`
			`if hasattr(sys.stdout, 'encoding'):`
			`outenc = sys.stdout.encoding`
stringutils.py: Add encoding argument to quote_output. Also work around a bug in locale.getpreferredencoding on older Pythons. 2010-06-16 04:20:12 +00:00			`if outenc is None:`
			`try:`
			`outenc = locale.getpreferredencoding()`
			`except Exception:`
			`pass # work around <http://bugs.python.org/issue1443504>`
			`output_encoding = _canonical_encoding(outenc)`

Unicode fixes. 2010-06-07 01:02:15 +00:00			`if sys.platform == 'win32':`
Back out Windows-specific Unicode argument support for v1.7. 2010-06-09 00:08:03 +00:00			`# Unicode arguments are not supported on Windows yet; see #565 and #1074.`
			`argv_encoding = 'ascii'`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`else:`
			`argv_encoding = output_encoding`
			`is_unicode_platform = sys.platform in ["win32", "darwin"]`

			`_reload()`


			`def get_filesystem_encoding():`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`"""`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`Returns expected encoding for local filenames.`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`"""`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`return filesystem_encoding`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00
Unicode fixes. 2010-06-07 01:02:15 +00:00			`def get_output_encoding():`
			`"""`
			`Returns expected encoding for writing to stdout or stderr.`
			`"""`
			`return output_encoding`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00
Unicode fixes. 2010-06-07 01:02:15 +00:00			`def get_argv_encoding():`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`"""`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`Returns expected encoding for command-line arguments.`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`"""`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`return argv_encoding`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00
Unicode fixes. 2010-06-07 01:02:15 +00:00			`def argv_to_unicode(s):`
			`"""`
			`Decode given argv element to unicode. If this fails, raise a UsageError.`
			`"""`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`precondition(isinstance(s, str), s)`

			`try:`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`return unicode(s, argv_encoding)`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`except UnicodeDecodeError:`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`raise usage.UsageError("Argument %s cannot be decoded as %s." %`
			`(quote_output(s), argv_encoding))`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00
			`def unicode_to_url(s):`
			`"""`
			`Encode an unicode object used in an URL.`
			`"""`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`# According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00
Unicode fixes. 2010-06-07 01:02:15 +00:00			`# FIXME`
			`return to_str(s)`
			`#precondition(isinstance(s, unicode), s)`
			`#return s.encode('utf-8')`

			`def to_str(s):`
			`if s is None or isinstance(s, str):`
			`return s`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`return s.encode('utf-8')`

Unicode fixes. 2010-06-07 01:02:15 +00:00			`def to_argv(s):`
			`if isinstance(s, str):`
			`return s`
			`return s.encode(argv_encoding)`

			`PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL)`
			`PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL)`

			`def is_printable_ascii(s):`
			`return PRINTABLE_ASCII.search(s) is not None`

			`def unicode_to_output(s):`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`"""`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`Encode an unicode object for representation on stdout or stderr.`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`"""`
			`precondition(isinstance(s, unicode), s)`

Unicode fixes. 2010-06-07 01:02:15 +00:00			`try:`
			`out = s.encode(output_encoding)`
Fix for Unicode-related test failures on Zooko's OS X 10.6 machine. 2010-06-09 05:54:48 +00:00			`except (UnicodeEncodeError, UnicodeDecodeError):`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`raise UnicodeEncodeError(output_encoding, s, 0, 0,`
			`"A string could not be encoded as %s for output to the terminal:\n%r" %`
			`(output_encoding, repr(s)))`

			`if PRINTABLE_8BIT.search(out) is None:`
			`raise UnicodeEncodeError(output_encoding, s, 0, 0,`
			`"A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %`
			`(output_encoding, repr(s)))`
			`return out`

stringutils.py: Add encoding argument to quote_output. Also work around a bug in locale.getpreferredencoding on older Pythons. 2010-06-16 04:20:12 +00:00			`def quote_output(s, quotemarks=True, encoding=None):`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`"""`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`Encode either a Unicode string or a UTF-8-encoded bytestring for representation`
			`on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is`
			`always surrounded by single quotes; otherwise, it is quoted only if necessary to`
			`avoid ambiguity or control bytes in the output.`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00			`"""`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`precondition(isinstance(s, (str, unicode)), s)`

			`if isinstance(s, str):`
			`try:`
			`s = s.decode('utf-8')`
			`except UnicodeDecodeError:`
			`return 'b' + repr(s)`

			`try:`
stringutils.py: Add encoding argument to quote_output. Also work around a bug in locale.getpreferredencoding on older Pythons. 2010-06-16 04:20:12 +00:00			`out = s.encode(encoding or output_encoding)`
Fix for Unicode-related test failures on Zooko's OS X 10.6 machine. 2010-06-09 05:54:48 +00:00			`except (UnicodeEncodeError, UnicodeDecodeError):`
Unicode fixes. 2010-06-07 01:02:15 +00:00			`return repr(s)`

			`if PRINTABLE_8BIT.search(out) is None:`
			`return repr(out)`
stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. 2010-05-20 00:41:05 +00:00
Unicode fixes. 2010-06-07 01:02:15 +00:00			`if quotemarks:`
			`return "'" + out.replace("\\", "\\\\").replace("'", "\'") + "'"`
			`else:`
			`return out`

			`def quote_path(path, quotemarks=True):`
			`return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks)`


			`def unicode_platform():`
			`"""`
			`Does the current platform handle Unicode filenames natively?`
			`"""`
			`return is_unicode_platform`