""" Functions used to convert inputs from whatever encoding used in the system to unicode and back. """ import sys, os, re, locale from types import NoneType from allmydata.util.assertutil import precondition, _assert from twisted.python import usage from twisted.python.filepath import FilePath from allmydata.util import log from allmydata.util.fileutil import abspath_expanduser_unicode def canonical_encoding(encoding): if encoding is None: log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD) encoding = 'utf-8' encoding = encoding.lower() if encoding == "cp65001": encoding = 'utf-8' elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968": encoding = 'ascii' return encoding def check_encoding(encoding): # sometimes Python returns an encoding name that it doesn't support for conversion # fail early if this happens try: u"test".encode(encoding) except (LookupError, AttributeError): raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,)) filesystem_encoding = None io_encoding = None is_unicode_platform = False use_unicode_filepath = False def _reload(): global filesystem_encoding, io_encoding, is_unicode_platform, use_unicode_filepath filesystem_encoding = canonical_encoding(sys.getfilesystemencoding()) check_encoding(filesystem_encoding) if sys.platform == 'win32': # On Windows we install UTF-8 stream wrappers for sys.stdout and # sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py). io_encoding = 'utf-8' else: ioenc = None if hasattr(sys.stdout, 'encoding'): ioenc = sys.stdout.encoding if ioenc is None: try: ioenc = locale.getpreferredencoding() except Exception: pass # work around io_encoding = canonical_encoding(ioenc) check_encoding(io_encoding) is_unicode_platform = sys.platform in ["win32", "darwin"] # Despite the Unicode-mode FilePath support added to Twisted in # , we can't yet use # Unicode-mode FilePaths with INotify on non-Windows platforms # due to . use_unicode_filepath = sys.platform == "win32" _reload() def get_filesystem_encoding(): """ Returns expected encoding for local filenames. """ return filesystem_encoding def get_io_encoding(): """ Returns expected encoding for writing to stdout or stderr, and for arguments in sys.argv. """ return io_encoding def argv_to_unicode(s): """ Decode given argv element to unicode. If this fails, raise a UsageError. """ precondition(isinstance(s, str), s) try: return unicode(s, io_encoding) except UnicodeDecodeError: raise usage.UsageError("Argument %s cannot be decoded as %s." % (quote_output(s), io_encoding)) def argv_to_abspath(s, **kwargs): """ Convenience function to decode an argv element to an absolute path, with ~ expanded. If this fails, raise a UsageError. """ decoded = argv_to_unicode(s) if decoded.startswith(u'-'): raise usage.UsageError("Path argument %s cannot start with '-'.\nUse %s if you intended to refer to a file." % (quote_output(s), quote_output(os.path.join('.', s)))) return abspath_expanduser_unicode(decoded, **kwargs) def unicode_to_argv(s, mangle=False): """ Encode the given Unicode argument as a bytestring. If the argument is to be passed to a different process, then the 'mangle' argument should be true; on Windows, this uses a mangled encoding that will be reversed by code in runner.py. """ precondition(isinstance(s, unicode), s) if mangle and sys.platform == "win32": # This must be the same as 'mangle' in bin/tahoe-script.template. return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s)) else: return s.encode(io_encoding) def unicode_to_url(s): """ Encode an unicode object used in an URL. """ # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded. # FIXME return to_str(s) #precondition(isinstance(s, unicode), s) #return s.encode('utf-8') def to_str(s): if s is None or isinstance(s, str): return s return s.encode('utf-8') def from_utf8_or_none(s): precondition(isinstance(s, (NoneType, str)), s) if s is None: return s return s.decode('utf-8') PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL) PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL) def is_printable_ascii(s): return PRINTABLE_ASCII.search(s) is not None def unicode_to_output(s): """ Encode an unicode object for representation on stdout or stderr. """ precondition(isinstance(s, unicode), s) try: out = s.encode(io_encoding) except (UnicodeEncodeError, UnicodeDecodeError): raise UnicodeEncodeError(io_encoding, s, 0, 0, "A string could not be encoded as %s for output to the terminal:\n%r" % (io_encoding, repr(s))) if PRINTABLE_8BIT.search(out) is None: raise UnicodeEncodeError(io_encoding, s, 0, 0, "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" % (io_encoding, repr(s))) return out def _unicode_escape(m, quote_newlines): u = m.group(0) if u == u'"' or u == u'$' or u == u'`' or u == u'\\': return u'\\' + u elif u == u'\n' and not quote_newlines: return u if len(u) == 2: codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000 else: codepoint = ord(u) if codepoint > 0xFFFF: return u'\\U%08x' % (codepoint,) elif codepoint > 0xFF: return u'\\u%04x' % (codepoint,) else: return u'\\x%02x' % (codepoint,) def _str_escape(m, quote_newlines): c = m.group(0) if c == '"' or c == '$' or c == '`' or c == '\\': return '\\' + c elif c == '\n' and not quote_newlines: return c else: return '\\x%02x' % (ord(c),) MUST_DOUBLE_QUOTE_NL = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) MUST_DOUBLE_QUOTE = re.compile(ur'[^\n\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) # if we must double-quote, then we have to escape ", $ and `, but need not escape ' ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL) def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None): """ Encode either a Unicode string or a UTF-8-encoded bytestring for representation on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or control bytes in the output. (Newlines are counted as control bytes iff quote_newlines is True.) Quoting may use either single or double quotes. Within single quotes, all characters stand for themselves, and ' will not appear. Within double quotes, Python-compatible backslash escaping is used. If not explicitly given, quote_newlines is True when quotemarks is True. """ precondition(isinstance(s, (str, unicode)), s) if quote_newlines is None: quote_newlines = quotemarks if isinstance(s, str): try: s = s.decode('utf-8') except UnicodeDecodeError: return 'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),) must_double_quote = quote_newlines and MUST_DOUBLE_QUOTE_NL or MUST_DOUBLE_QUOTE if must_double_quote.search(s) is None: try: out = s.encode(encoding or io_encoding) if quotemarks or out.startswith('"'): return "'%s'" % (out,) else: return out except (UnicodeDecodeError, UnicodeEncodeError): pass escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s) return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),) def quote_path(path, quotemarks=True): return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True) def quote_local_unicode_path(path, quotemarks=True): precondition(isinstance(path, unicode), path) if sys.platform == "win32" and path.startswith(u"\\\\?\\"): path = path[4 :] if path.startswith(u"UNC\\"): path = u"\\\\" + path[4 :] return quote_output(path, quotemarks=quotemarks, quote_newlines=True) def quote_filepath(path, quotemarks=True): return quote_local_unicode_path(unicode_from_filepath(path), quotemarks=quotemarks) def extend_filepath(fp, segments): # We cannot use FilePath.preauthChild, because # * it has the security flaw described in ; # * it may return a FilePath in the wrong mode. for segment in segments: fp = fp.child(segment) if isinstance(fp.path, unicode) and not use_unicode_filepath: return FilePath(fp.path.encode(filesystem_encoding)) else: return fp def to_filepath(path): precondition(isinstance(path, unicode if use_unicode_filepath else basestring), path=path) if isinstance(path, unicode) and not use_unicode_filepath: path = path.encode(filesystem_encoding) if sys.platform == "win32": _assert(isinstance(path, unicode), path=path) if path.startswith(u"\\\\?\\") and len(path) > 4: # FilePath normally strips trailing path separators, but not in this case. path = path.rstrip(u"\\") return FilePath(path) def _decode(s): precondition(isinstance(s, basestring), s=s) if isinstance(s, bytes): return s.decode(filesystem_encoding) else: return s def unicode_from_filepath(fp): precondition(isinstance(fp, FilePath), fp=fp) return _decode(fp.path) def unicode_segments_from(base_fp, ancestor_fp): precondition(isinstance(base_fp, FilePath), base_fp=base_fp) precondition(isinstance(ancestor_fp, FilePath), ancestor_fp=ancestor_fp) return base_fp.asTextMode().segmentsFrom(ancestor_fp.asTextMode()) def unicode_platform(): """ Does the current platform handle Unicode filenames natively? """ return is_unicode_platform class FilenameEncodingError(Exception): """ Filename cannot be encoded using the current encoding of your filesystem (%s). Please configure your locale correctly or rename this file. """ pass def listdir_unicode_fallback(path): """ This function emulates a fallback Unicode API similar to one available under Windows or MacOS X. If badly encoded filenames are encountered, an exception is raised. """ precondition(isinstance(path, unicode), path) try: byte_path = path.encode(filesystem_encoding) except (UnicodeEncodeError, UnicodeDecodeError): raise FilenameEncodingError(path) try: return [unicode(fn, filesystem_encoding) for fn in os.listdir(byte_path)] except UnicodeDecodeError: raise FilenameEncodingError(fn) def listdir_unicode(path): """ Wrapper around listdir() which provides safe access to the convenient Unicode API even under platforms that don't provide one natively. """ precondition(isinstance(path, unicode), path) # On Windows and MacOS X, the Unicode API is used # On other platforms (ie. Unix systems), the byte-level API is used if is_unicode_platform: return os.listdir(path) else: return listdir_unicode_fallback(path) def listdir_filepath(fp): return listdir_unicode(unicode_from_filepath(fp))