stringutils.py: Unicode helper functions + associated tests

This file contains a bunch of helper functions which convert
unicode strings to and from argv, filenames and stdout.
This commit is contained in:
Francois Deppierraz 2010-05-19 17:41:05 -07:00
parent b2542b8708
commit d0ed14e1bb
2 changed files with 387 additions and 0 deletions

View File

@ -0,0 +1,259 @@
# coding=utf-8
# Filenames used by the directory-listing tests and by the generator script
# below: one name with a non-ASCII character, one plain ASCII name and one
# name containing a space.
TEST_FILENAMES = (
    u'Ärtonwall.mp3',
    u'test_file',
    u'Blah blah.txt',
)
# The following main helps to generate a test class for other operating
# systems.
if __name__ == "__main__":
import sys, os
import tempfile
import shutil
import platform
if len(sys.argv) != 2:
print "Usage: %s lumière" % sys.argv[0]
sys.exit(1)
print
print "class MyWeirdOS(StringUtils, unittest.TestCase):"
print " uname = '%s'" % ' '.join(platform.uname())
print " argv = %s" % repr(sys.argv[1])
print " platform = '%s'" % sys.platform
print " filesystemencoding = '%s'" % sys.getfilesystemencoding()
print " stdoutencoding = '%s'" % sys.stdout.encoding
try:
tmpdir = tempfile.mkdtemp()
for fname in TEST_FILENAMES:
open(os.path.join(tmpdir, fname), 'w').close()
# Use Unicode API under Windows or MacOS X
if sys.platform in ('win32', 'darwin'):
dirlist = os.listdir(unicode(tmpdir))
else:
dirlist = os.listdir(tmpdir)
print " dirlist = %s" % repr(dirlist)
except:
print " # Oops, I cannot write filenames containing non-ascii characters"
print
shutil.rmtree(tmpdir)
sys.exit(0)
from twisted.trial import unittest
from mock import patch
import sys
from allmydata.util.stringutils import argv_to_unicode, unicode_to_url, \
unicode_to_stdout, unicode_platform, listdir_unicode, open_unicode, \
FilenameEncodingError, get_term_encoding
from twisted.python import usage
class StringUtilsErrors(unittest.TestCase):
    """
    Tests for the fallback and error paths of the stringutils helpers.
    """

    @patch('sys.stdout')
    def test_get_term_encoding(self, mock):
        # A stdout which declares no encoding must fall back to ASCII
        mock.encoding = None

        self.failUnlessEqual(get_term_encoding(), 'ascii')

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        # A latin1-encoded 'è' is not valid UTF-8, so decoding must be
        # reported to the user as a usage error
        mock.encoding = 'utf-8'

        self.failUnlessRaises(usage.UsageError,
                              argv_to_unicode,
                              u'lumière'.encode('latin1'))

    def test_unicode_to_url(self):
        # Placeholder: no error case is exercised for unicode_to_url()
        pass

    @patch('sys.stdout')
    def test_unicode_to_stdout(self, mock):
        # Encoding koi8-r cannot represent 'è'
        mock.encoding = 'koi8-r'
        self.failUnlessEqual(unicode_to_stdout(u'lumière'), 'lumi?re')

    @patch('os.listdir')
    def test_unicode_normalization(self, mock):
        # Pretend to run on an Unicode platform such as Windows
        orig_platform = sys.platform
        sys.platform = 'win32'
        try:
            # Decomposed form (A + combining diaeresis) must come back
            # composed (NFC) as a single code point
            mock.return_value = [u'A\u0308rtonwall.mp3']
            self.failUnlessEqual(listdir_unicode(u'/dummy'), [u'\xc4rtonwall.mp3'])
        finally:
            # Restore even when the assertion fails, so the fake platform
            # does not leak into the tests that run afterwards
            sys.platform = orig_platform
# The following tests apply only to platforms which don't store filenames as
# Unicode entities on the filesystem.
class StringUtilsNonUnicodePlatform(unittest.TestCase):
    """
    Error cases of listdir_unicode() and open_unicode() when the byte-level
    filesystem API is in use.
    """

    def setUp(self):
        # unicode_platform() looks at sys.platform, so fake a Unix value
        self.original_platform = sys.platform
        sys.platform = 'linux'

    def tearDown(self):
        sys.platform = self.original_platform

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, listdir_mock, fsencoding_mock):
        # What happens if a latin1-encoded filename is encountered on an
        # UTF-8 filesystem?
        fsencoding_mock.return_value = 'utf-8'
        utf8_name = u'lumière'.encode('utf-8')
        latin1_name = u'lumière'.encode('latin1')
        listdir_mock.return_value = [utf8_name, latin1_name]

        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/dummy')

        # We're trying to list a directory whose name cannot be represented
        # in the filesystem encoding. This should fail.
        fsencoding_mock.return_value = 'ascii'

        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/lumière')

    @patch('sys.getfilesystemencoding')
    def test_open_unicode(self, fsencoding_mock):
        # An ASCII filesystem encoding cannot represent 'è' in the filename
        fsencoding_mock.return_value = 'ascii'

        self.failUnlessRaises(FilenameEncodingError, open_unicode, u'lumière')
class StringUtils():
    """
    Mixin holding the platform-independent tests.

    Concrete subclasses also inherit from unittest.TestCase and provide the
    class attributes read here: platform, stdoutencoding, filesystemencoding,
    dirlist and (optionally) argv.
    """

    def setUp(self):
        # Mock sys.platform because unicode_platform() uses it
        self.original_platform = sys.platform
        sys.platform = self.platform

    def tearDown(self):
        sys.platform = self.original_platform

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        if not hasattr(self, 'argv'):
            raise unittest.SkipTest("There's no way to pass non-ASCII arguments in CLI on this (mocked) platform")

        mock.encoding = self.stdoutencoding

        argu = u'lumière'
        argv = self.argv

        self.failUnlessEqual(argv_to_unicode(argv), argu)

    def test_unicode_to_url(self):
        # failUnlessEqual, not failUnless: the latter would take the second
        # argument as the failure message and never compare the two values
        self.failUnlessEqual(unicode_to_url(u'lumière'), u'lumière'.encode('utf-8'))

    @patch('sys.stdout')
    def test_unicode_to_stdout(self, mock):
        if not hasattr(self, 'argv'):
            raise unittest.SkipTest("There's no way to pass non-ASCII arguments in CLI on this (mocked) platform")

        mock.encoding = self.stdoutencoding
        self.failUnlessEqual(unicode_to_stdout(u'lumière'), self.argv)

    def test_unicode_platform(self):
        # Expected answer of unicode_platform() for each mocked sys.platform
        matrix = {
            'linux2': False,
            'win32': True,
            'darwin': True,
        }

        self.failUnlessEqual(unicode_platform(), matrix[self.platform])

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        mock_listdir.return_value = self.dirlist
        mock_getfilesystemencoding.return_value = self.filesystemencoding

        filenames = listdir_unicode(u'/dummy')

        # Every returned entry must be a unicode object, whatever the
        # platform's native listing type was
        for fname in filenames:
            self.failUnless(isinstance(fname, unicode),
                            "%r is not a unicode object" % (fname,))

        for fname in TEST_FILENAMES:
            self.failUnless(isinstance(fname, unicode))

            if fname not in filenames:
                self.fail("Cannot find %r in %r" % (fname, filenames))

    @patch('os.open')
    def test_open_unicode(self, mock):
        # NOTE(review): os.open is patched here, but open_unicode() calls the
        # builtin open(); the IOError presumably comes from the directory not
        # existing -- confirm.
        self.failUnlessRaises(IOError,
                              open_unicode,
                              u'/dummy_directory/lumière.txt')
class UbuntuKarmicUTF8(StringUtils, unittest.TestCase):
    # Linux fixture where both the filesystem and the terminal use UTF-8;
    # argv and dirlist are byte strings in that encoding.
    platform = 'linux2'
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    filesystemencoding = 'UTF-8'
    stdoutencoding = 'UTF-8'
    argv = 'lumi\xc3\xa8re'
    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
class UbuntuKarmicLatin1(StringUtils, unittest.TestCase):
    # Linux fixture where both the filesystem and the terminal use
    # ISO-8859-1; argv and dirlist are byte strings in that encoding.
    platform = 'linux2'
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    filesystemencoding = 'ISO-8859-1'
    stdoutencoding = 'ISO-8859-1'
    argv = 'lumi\xe8re'
    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
class WindowsXP(StringUtils, unittest.TestCase):
    # Windows fixture: directory listings are already unicode objects and
    # the console encoding is cp850.
    platform = 'win32'
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    filesystemencoding = 'mbcs'
    stdoutencoding = 'cp850'
    argv = 'lumi\xe8re'
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']

    # Class-level Trial "todo": applies to every test of this class.
    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
class WindowsXP_UTF8(StringUtils, unittest.TestCase):
    # Windows fixture: same values as the cp850 one except that the console
    # encoding is cp65001.
    platform = 'win32'
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    filesystemencoding = 'mbcs'
    stdoutencoding = 'cp65001'
    argv = 'lumi\xe8re'
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']

    # Class-level Trial "todo": applies to every test of this class.
    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
class WindowsVista(StringUtils, unittest.TestCase):
    # Windows fixture: directory listings are already unicode objects and
    # the console encoding is cp850.
    platform = 'win32'
    uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel'
    filesystemencoding = 'mbcs'
    stdoutencoding = 'cp850'
    argv = 'lumi\xe8re'
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']

    # Class-level Trial "todo": applies to every test of this class.
    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
class MacOSXLeopard(StringUtils, unittest.TestCase):
    # Mac OS X fixture with a UTF-8 terminal.  The listing holds the
    # decomposed form (A + combining diaeresis) of the non-ASCII filename.
    platform = 'darwin'
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    filesystemencoding = 'utf-8'
    stdoutencoding = 'UTF-8'
    argv = 'lumi\xc3\xa8re'
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
class MacOSXLeopard7bit(StringUtils, unittest.TestCase):
    # Mac OS X fixture with a US-ASCII terminal.
    #
    # argv is deliberately left undefined: the StringUtils tests that need
    # it raise SkipTest when the attribute is missing.
    platform = 'darwin'
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    filesystemencoding = 'utf-8'
    stdoutencoding = 'US-ASCII'
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']

View File

@ -0,0 +1,128 @@
"""
Functions used to convert inputs from whatever encoding is used by the system to
unicode and back.
"""
import sys
import os
import unicodedata
from allmydata.util.assertutil import precondition
from twisted.python import usage
def get_term_encoding():
    """
    Returns expected encoding for writing to the terminal and reading
    arguments from the command-line.

    Falls back to 'ascii' when sys.stdout declares no encoding, i.e. when
    its 'encoding' attribute is None or when stdout has been replaced by an
    object which has no such attribute at all.
    """
    # getattr() rather than plain attribute access: a replacement stdout
    # object is not guaranteed to carry an 'encoding' attribute.
    encoding = getattr(sys.stdout, 'encoding', None)

    if encoding is None:
        return 'ascii'
    return encoding
def argv_to_unicode(s):
    """
    Decode given argv element to unicode.

    The byte string is decoded with the encoding reported by
    get_term_encoding().  If it cannot be decoded, a usage.UsageError
    carrying a message for the user is raised.
    """
    precondition(isinstance(s, str), s)

    term_encoding = get_term_encoding()
    try:
        decoded = unicode(s, term_encoding)
    except UnicodeDecodeError:
        raise usage.UsageError("Argument '%s' cannot be decoded as %s." %
                               (s, term_encoding))
    return decoded
def unicode_to_url(s):
    """
    Encode an unicode object used in an URL.

    Returns the UTF-8 encoded byte string.
    """
    precondition(isinstance(s, unicode), s)

    # According to RFC 2718, non-ascii characters in url's must be UTF-8
    # encoded.
    utf8_bytes = s.encode('utf-8')
    return utf8_bytes
def unicode_to_stdout(s):
    """
    Encode an unicode object for representation on stdout.

    Characters which the terminal encoding cannot represent are replaced
    (the 'replace' error handler) instead of raising an error.
    """
    precondition(isinstance(s, unicode), s)

    term_encoding = get_term_encoding()
    return s.encode(term_encoding, 'replace')
def unicode_platform():
    """
    Does the current platform handle Unicode filenames natively ?

    Returns True when sys.platform is 'win32' or 'darwin', False otherwise.
    """
    native_unicode_platforms = ('win32', 'darwin')
    return sys.platform in native_unicode_platforms
# NOTE(review): the docstring below contains a %s placeholder, so it is
# presumably also used as a message template somewhere -- check callers
# before rewording it.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
def listdir_unicode_unix(path):
    """
    This function emulates an Unicode API under Unix similar to one available
    under Windows or MacOS X.

    The unicode path is encoded with the filesystem encoding, listed through
    the byte-level API, and every returned name is decoded back to unicode.

    If badly encoded filenames are encountered, an exception is raised:
    FilenameEncodingError(path) when the path itself cannot be encoded, or
    FilenameEncodingError(fn) for the first entry which cannot be decoded.
    """
    precondition(isinstance(path, unicode), path)

    encoding = sys.getfilesystemencoding()
    try:
        byte_path = path.encode(encoding)
    except UnicodeEncodeError:
        raise FilenameEncodingError(path)

    # Decode entry by entry so the failing name is bound by the loop itself,
    # instead of relying on a list-comprehension variable leaking into the
    # enclosing scope.
    filenames = []
    for fn in os.listdir(byte_path):
        try:
            filenames.append(unicode(fn, encoding))
        except UnicodeDecodeError:
            raise FilenameEncodingError(fn)
    return filenames
def listdir_unicode(path, encoding = None):
    """
    Wrapper around listdir() which provides safe access to the convenient
    Unicode API even under Unix.

    Returns the directory entries as NFC-normalized unicode strings.  The
    'encoding' argument is accepted but not used by this function.
    """
    precondition(isinstance(path, unicode), path)

    if unicode_platform():
        # On Windows and MacOS X, the Unicode API is used
        raw_names = os.listdir(path)
    else:
        # On other platforms (ie. Unix systems), the byte-level API is used
        raw_names = listdir_unicode_unix(path)

    # Normalize the resulting unicode filenames, so that different OSes do
    # not yield non-equal unicode strings for the same filename
    normalized_names = []
    for name in raw_names:
        normalized_names.append(unicodedata.normalize('NFC', name))
    return normalized_names
def open_unicode(path, mode='r'):
    """
    Wrapper around open() which provides safe access to the convenient Unicode
    API even under Unix.

    Raises FilenameEncodingError when the path cannot be encoded with the
    filesystem encoding on a platform using the byte-level API.
    """
    precondition(isinstance(path, unicode), path)

    if not unicode_platform():
        # Byte-level API: the path has to be encoded first
        fs_encoding = sys.getfilesystemencoding()
        try:
            return open(path.encode(fs_encoding), mode)
        except UnicodeEncodeError:
            raise FilenameEncodingError(path)

    # Unicode API: the path can be passed straight through
    return open(path, mode)