util.encodingutil: change quote_output to do less unnecessary escaping, and to use double-quotes more consistently when needed. This version avoids u-escaping for characters that are representable in the output encoding, when double quotes are used, and includes tests. fixes #1135

This commit is contained in:
david-sarah 2010-07-23 00:53:14 -07:00
parent 618db4867c
commit 28e6ad51a7
2 changed files with 150 additions and 18 deletions

View File

@ -57,8 +57,8 @@ import os, sys, locale
from allmydata.test.common_util import ReallyEqualMixin
from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
unicode_to_output, unicode_platform, listdir_unicode, FilenameEncodingError, \
get_output_encoding, get_filesystem_encoding, _reload
unicode_to_output, quote_output, unicode_platform, listdir_unicode, \
FilenameEncodingError, get_output_encoding, get_filesystem_encoding, _reload
from allmydata.dirnode import normalize
from twisted.python import usage
@ -286,6 +286,103 @@ class StdlibUnicode(unittest.TestCase):
self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
    # Exercises quote_output() against the 'ascii', 'latin1' and 'utf-8'
    # output encodings, both passed explicitly and picked up from a mocked
    # sys.stdout.

    def tearDown(self):
        # test_quote_output_mock calls _reload() while sys.stdout is patched so
        # that quote_output(encoding=None) follows the mocked encoding. By the
        # time tearDown runs the @patch decorator has been undone, so calling
        # _reload() again here refreshes the module state from the real
        # sys.stdout instead of leaving the last mocked encoding in place for
        # whichever test runs next.
        _reload()

    def _check(self, inp, out, enc, optional_quotes):
        # Assert that quote_output(inp, encoding=enc) == out, and that with
        # quotemarks=False the result is out2: 'out' minus its first and last
        # character when the quotes are optional, otherwise 'out' unchanged.
        out2 = out
        if optional_quotes:
            out2 = out2[1:-1]
        self.failUnlessReallyEqual(quote_output(inp, encoding=enc), out)
        self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False), out2)
        # Outputs starting with b" are the invalid-UTF-8 bytestring cases; those
        # inputs have no equivalent in the other string type, so the cross-check
        # below is skipped for them.
        if out[0:2] != 'b"':
            if isinstance(inp, str):
                # A bytestring input must quote the same as its unicode form...
                self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc), out)
                self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False), out2)
            else:
                # ...and a unicode input the same as its UTF-8 encoding.
                self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc), out)
                self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False), out2)

    def _test_quote_output_all(self, enc):
        # Cases whose expected output is the same for every encoding under test.
        def check(inp, out, optional_quotes=False):
            self._check(inp, out, enc, optional_quotes)

        # optional single quotes
        check("foo", "'foo'", True)
        check("\\", "'\\'", True)
        check("$\"`", "'$\"`'", True)

        # mandatory single quotes
        check("\"", "'\"'")

        # double quotes
        check("'", "\"'\"")
        check("\n", "\"\\x0a\"")
        check("\x00", "\"\\x00\"")

        # invalid Unicode and astral planes
        check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"")
        check(u"\uDC00\uD800", "\"\\udc00\\ud800\"")
        check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
        check(u"\uD800\uDC00", "\"\\U00010000\"")
        check(u"\uD800\uDC01", "\"\\U00010001\"")
        check(u"\uD801\uDC00", "\"\\U00010400\"")
        check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"")
        check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"")
        check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"")

        # invalid UTF-8
        check("\xFF", "b\"\\xff\"")
        check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")

    def test_quote_output_ascii(self, enc='ascii'):
        # With an ASCII output encoding every non-ASCII character is expected
        # in escaped form inside double quotes.
        def check(inp, out, optional_quotes=False):
            self._check(inp, out, enc, optional_quotes)

        self._test_quote_output_all(enc)
        check(u"\u00D7", "\"\\xd7\"")
        check(u"'\u00D7", "\"'\\xd7\"")
        check(u"\"\u00D7", "\"\\\"\\xd7\"")
        check(u"\u2621", "\"\\u2621\"")
        check(u"'\u2621", "\"'\\u2621\"")
        check(u"\"\u2621", "\"\\\"\\u2621\"")

    def test_quote_output_latin1(self, enc='latin1'):
        # U+00D7 is expected verbatim (latin1-encoded); U+2621 is expected as
        # a \u escape.
        def check(inp, out, optional_quotes=False):
            self._check(inp, out.encode('latin1'), enc, optional_quotes)

        self._test_quote_output_all(enc)
        check(u"\u00D7", u"'\u00D7'", True)
        check(u"'\u00D7", u"\"'\u00D7\"")
        check(u"\"\u00D7", u"'\"\u00D7'")
        check(u"\u00D7\"", u"'\u00D7\"'", True)
        check(u"\u2621", u"\"\\u2621\"")
        check(u"'\u2621", u"\"'\\u2621\"")
        check(u"\"\u2621", u"\"\\\"\\u2621\"")

    def test_quote_output_utf8(self, enc='utf-8'):
        # U+2621 is expected verbatim (UTF-8-encoded) in every case here.
        def check(inp, out, optional_quotes=False):
            self._check(inp, out.encode('utf-8'), enc, optional_quotes)

        self._test_quote_output_all(enc)
        check(u"\u2621", u"'\u2621'", True)
        check(u"'\u2621", u"\"'\u2621\"")
        check(u"\"\u2621", u"'\"\u2621'")
        check(u"\u2621\"", u"'\u2621\"'", True)

    @patch('sys.stdout')
    def test_quote_output_mock(self, mock_stdout):
        # Re-run each per-encoding suite with enc=None, so that quote_output
        # has to use the encoding picked up by _reload() from the patched
        # sys.stdout rather than an explicit argument.
        mock_stdout.encoding = 'ascii'
        _reload()
        self.test_quote_output_ascii(None)

        mock_stdout.encoding = 'latin1'
        _reload()
        self.test_quote_output_latin1(None)

        mock_stdout.encoding = 'utf-8'
        _reload()
        self.test_quote_output_utf8(None)
class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
output = 'lumi\xc3\xa8re'

View File

@ -120,8 +120,8 @@ def to_argv(s):
return s
return s.encode(argv_encoding)
# Whole-string patterns (anchored with ^ and $): PRINTABLE_ASCII accepts only
# space through '~' plus "\n" and "\r"; PRINTABLE_8BIT additionally accepts
# \x80-\xFF.
# NOTE(review): each name is bound twice below, which looks like the
# before/after lines of a diff. The second pair spells the ASCII range with hex
# escapes, and its 8-bit pattern no longer excludes the apostrophe (\x27).
PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL)
PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL)
PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL)
PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
def is_printable_ascii(s):
    """Return True when the PRINTABLE_ASCII pattern matches s, else False."""
    found = PRINTABLE_ASCII.search(s)
    return found is not None
@ -145,12 +145,47 @@ def unicode_to_output(s):
(output_encoding, repr(s)))
return out
def _unicode_escape(m):
u = m.group(0)
if u == '"' or u == '$' or u == '`' or u == '\\':
return u'\\' + u
if len(u) == 2:
codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
else:
codepoint = ord(u)
if codepoint > 0xFFFF:
return u'\\U%08x' % (codepoint,)
elif codepoint > 0xFF:
return u'\\u%04x' % (codepoint,)
else:
return u'\\x%02x' % (codepoint,)
def _str_escape(m):
c = m.group(0)
if c == '"' or c == '$' or c == '`' or c == '\\':
return '\\' + c
else:
return '\\x%02x' % (ord(c),)
# Matches any character that cannot be shown verbatim between single quotes,
# i.e. anything outside: printable ASCII other than the apostrophe (\x27),
# U+00A0-U+D7FF, U+E000-U+FDCF and U+FDF0-U+FFFC. quote_output only uses
# single quotes (or no quotes) when this pattern finds nothing.
MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
# if we must double-quote, then we have to escape ", $ and `, but need not escape '
# ESCAPABLE_UNICODE matches what has to be rewritten inside double quotes:
# either a high+low surrogate pair (matched as one unit) or any single
# character outside the safe set. The safe set leaves out \x22 ("), \x24 ($),
# \x5C (backslash) and \x60 (`), so those four are matched as well as control
# and other excluded characters.
ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs
ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
re.DOTALL)
# Bytestring counterpart with the same ASCII safe set and no non-ASCII ranges,
# so every byte from \x7F upwards matches too.
ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
def quote_output(s, quotemarks=True, encoding=None):
"""
Encode either a Unicode string or a UTF-8-encoded bytestring for representation
on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
always surrounded by single quotes; otherwise, it is quoted only if necessary to
avoid ambiguity or control bytes in the output.
always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
control bytes in the output.
Quoting may use either single or double quotes. Within single quotes, all
characters stand for themselves, and ' will not appear. Within double quotes,
Python-compatible backslash escaping is used.
"""
precondition(isinstance(s, (str, unicode)), s)
@ -158,20 +193,20 @@ def quote_output(s, quotemarks=True, encoding=None):
try:
s = s.decode('utf-8')
except UnicodeDecodeError:
return 'b' + repr(s)
return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)
try:
out = s.encode(encoding or output_encoding)
except (UnicodeEncodeError, UnicodeDecodeError):
return repr(s)
if MUST_DOUBLE_QUOTE.search(s) is None:
try:
out = s.encode(encoding or output_encoding)
if quotemarks or out.startswith('"'):
return "'%s'" % (out,)
else:
return out
except (UnicodeDecodeError, UnicodeEncodeError):
pass
if PRINTABLE_8BIT.search(out) is None:
return repr(out)
if quotemarks:
return "'" + out.replace("\\", "\\\\").replace("'", "\'") + "'"
else:
return out
escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
return '"%s"' % (escaped.encode(encoding or output_encoding, 'backslashreplace'),)
def quote_path(path, quotemarks=True):
    """
    Quote a path given as a sequence of segments.

    Each segment is passed through to_str, the results are joined with "/",
    and the joined string is handed to quote_output together with the
    'quotemarks' flag.
    """
    joined = "/".join([to_str(segment) for segment in path])
    return quote_output(joined, quotemarks=quotemarks)