mirror of
https://github.com/tahoe-lafs/tahoe-lafs.git
synced 2025-02-20 17:52:50 +00:00
stringutils.py: Unicode helper functions + associated tests
This file contains a bunch of helper functions which convert unicode strings to and from argv, filenames and stdout.
This commit is contained in:
parent
b2542b8708
commit
d0ed14e1bb
259
src/allmydata/test/test_stringutils.py
Normal file
259
src/allmydata/test/test_stringutils.py
Normal file
@ -0,0 +1,259 @@
|
||||
# coding=utf-8
|
||||
|
||||
# Filenames (one of them non-ASCII) shared by the generator script below and
# by the listdir tests further down in this module.
TEST_FILENAMES = (
    u'Ärtonwall.mp3',
    u'test_file',
    u'Blah blah.txt',
)

# The following main helps to generate a test class for other operating
# systems: run this file with exactly one command-line argument (the usage
# message suggests "lumière") and it prints a StringUtils subclass describing
# how the current platform reports argv, its encodings and a directory
# listing of TEST_FILENAMES.

if __name__ == "__main__":
    import sys
    import os
    import tempfile
    import shutil
    import platform

    if len(sys.argv) != 2:
        print("Usage: %s lumière" % sys.argv[0])
        sys.exit(1)

    # Single-argument parenthesised print produces the same output as the
    # print statement on Python 2.
    print("")
    print("class MyWeirdOS(StringUtils, unittest.TestCase):")
    print(" uname = '%s'" % ' '.join(platform.uname()))
    print(" argv = %s" % repr(sys.argv[1]))
    print(" platform = '%s'" % sys.platform)
    print(" filesystemencoding = '%s'" % sys.getfilesystemencoding())
    print(" stdoutencoding = '%s'" % sys.stdout.encoding)

    # Create the directory outside the inner try block: if this fails there
    # is nothing to clean up and the real error should be reported.
    tmpdir = tempfile.mkdtemp()
    try:
        try:
            for fname in TEST_FILENAMES:
                open(os.path.join(tmpdir, fname), 'w').close()

            # Use Unicode API under Windows or MacOS X
            if sys.platform in ('win32', 'darwin'):
                dirlist = os.listdir(unicode(tmpdir))
            else:
                dirlist = os.listdir(tmpdir)

            print(" dirlist = %s" % repr(dirlist))
        except (UnicodeError, EnvironmentError):
            # Only encoding and filesystem failures mean "this platform cannot
            # store these names"; anything else (e.g. Ctrl-C) must propagate.
            print(" # Oops, I cannot write filenames containing non-ascii characters")
        print("")
    finally:
        # Always remove the scratch directory, even on unexpected errors.
        shutil.rmtree(tmpdir)

    sys.exit(0)
|
||||
|
||||
from twisted.trial import unittest
|
||||
from mock import patch
|
||||
import sys
|
||||
|
||||
from allmydata.util.stringutils import argv_to_unicode, unicode_to_url, \
|
||||
unicode_to_stdout, unicode_platform, listdir_unicode, open_unicode, \
|
||||
FilenameEncodingError, get_term_encoding
|
||||
from twisted.python import usage
|
||||
|
||||
class StringUtilsErrors(unittest.TestCase):
    """
    Error paths and fallbacks of the stringutils helpers, exercised with
    mocked sys.stdout / os.listdir.
    """

    @patch('sys.stdout')
    def test_get_term_encoding(self, mock):
        # A stdout whose encoding is None must make us fall back to ASCII.
        mock.encoding = None

        self.failUnlessEqual(get_term_encoding(), 'ascii')

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        # A latin1-encoded argument is not valid UTF-8, so decoding it with
        # a UTF-8 terminal encoding must be reported as a usage error.
        mock.encoding = 'utf-8'

        self.failUnlessRaises(usage.UsageError,
                              argv_to_unicode,
                              u'lumière'.encode('latin1'))

    def test_unicode_to_url(self):
        # Placeholder: no error case is exercised for unicode_to_url here.
        pass

    @patch('sys.stdout')
    def test_unicode_to_stdout(self, mock):
        # Encoding koi8-r cannot represent 'è'
        mock.encoding = 'koi8-r'
        self.failUnlessEqual(unicode_to_stdout(u'lumière'), 'lumi?re')

    @patch('os.listdir')
    def test_unicode_normalization(self, mock):
        # Pretend to run on an Unicode platform such as Windows
        orig_platform = sys.platform
        sys.platform = 'win32'

        # Restore the real platform even when the assertion fails, otherwise
        # the fake value would leak into every test that runs afterwards.
        try:
            # A decomposed name (A + combining diaeresis) must come back in
            # its precomposed form.
            mock.return_value = [u'A\u0308rtonwall.mp3']
            self.failUnlessEqual(listdir_unicode(u'/dummy'), [u'\xc4rtonwall.mp3'])
        finally:
            sys.platform = orig_platform
|
||||
|
||||
# The following tests apply only to platforms which don't store filenames as
# Unicode entities on the filesystem.
class StringUtilsNonUnicodePlatform(unittest.TestCase):
    """
    Behaviour of the filename helpers when sys.platform is forced to 'linux',
    i.e. when unicode_platform() reports a byte-oriented filesystem API.
    """

    def setUp(self):
        # unicode_platform() looks at sys.platform, so swap it for the
        # duration of each test.
        self._saved_platform = sys.platform
        sys.platform = 'linux'

    def tearDown(self):
        sys.platform = self._saved_platform

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, listdir_mock, fsencoding_mock):
        # What happens if a latin1-encoded filename is encountered on a UTF-8
        # filesystem?
        well_encoded = u'lumière'.encode('utf-8')
        badly_encoded = u'lumière'.encode('latin1')
        listdir_mock.return_value = [well_encoded, badly_encoded]
        fsencoding_mock.return_value = 'utf-8'

        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/dummy')

        # We're trying to list a directory whose name cannot be represented in
        # the filesystem encoding. This should fail.
        fsencoding_mock.return_value = 'ascii'
        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/lumière')

    @patch('sys.getfilesystemencoding')
    def test_open_unicode(self, fsencoding_mock):
        # A non-ASCII name cannot be encoded for an ASCII filesystem.
        fsencoding_mock.return_value = 'ascii'

        self.failUnlessRaises(FilenameEncodingError, open_unicode, u'lumière')
|
||||
|
||||
class StringUtils:
    """
    Mixin holding the platform-independent test logic.

    Concrete subclasses (also deriving from unittest.TestCase) provide the
    class attributes read here: `platform`, `stdoutencoding`,
    `filesystemencoding`, `dirlist` and, optionally, `argv`.
    """

    def setUp(self):
        # Mock sys.platform because unicode_platform() uses it
        self.original_platform = sys.platform
        sys.platform = self.platform

    def tearDown(self):
        sys.platform = self.original_platform

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        if not hasattr(self, 'argv'):
            raise unittest.SkipTest("There's no way to pass non-ASCII arguments in CLI on this (mocked) platform")

        mock.encoding = self.stdoutencoding

        argu = u'lumière'
        argv = self.argv

        self.failUnlessEqual(argv_to_unicode(argv), argu)

    def test_unicode_to_url(self):
        # failUnless(a, b) would only check that `a` is truthy and treat `b`
        # as the failure message; the values must actually be compared.
        self.failUnlessEqual(unicode_to_url(u'lumière'), u'lumière'.encode('utf-8'))

    @patch('sys.stdout')
    def test_unicode_to_stdout(self, mock):
        if not hasattr(self, 'argv'):
            raise unittest.SkipTest("There's no way to pass non-ASCII arguments in CLI on this (mocked) platform")

        mock.encoding = self.stdoutencoding
        self.failUnlessEqual(unicode_to_stdout(u'lumière'), self.argv)

    def test_unicode_platform(self):
        # Expected answer of unicode_platform() for each mocked sys.platform.
        matrix = {
            'linux2': False,
            'win32': True,
            'darwin': True,
        }

        self.failUnlessEqual(unicode_platform(), matrix[self.platform])

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):

        mock_listdir.return_value = self.dirlist
        mock_getfilesystemencoding.return_value = self.filesystemencoding

        filenames = listdir_unicode(u'/dummy')

        # The point of listdir_unicode() is to hand back unicode objects, so
        # check the returned names and not only the reference constants.
        for returned in filenames:
            self.failUnless(isinstance(returned, unicode),
                            "%r is not a unicode object" % (returned,))

        for fname in TEST_FILENAMES:
            self.failUnless(isinstance(fname, unicode))

            if fname not in filenames:
                self.fail("Cannot find %r in %r" % (fname, filenames))

    @patch('os.open')
    def test_open_unicode(self, mock):
        # NOTE(review): open_unicode() calls the builtin open(), not os.open,
        # so this patch presumably has no effect and the IOError comes from
        # the directory not existing on disk -- confirm.
        self.failUnlessRaises(IOError,
                              open_unicode,
                              u'/dummy_directory/lumière.txt')
|
||||
|
||||
|
||||
class UbuntuKarmicUTF8(StringUtils, unittest.TestCase):
    """
    Fixture values for a 'linux2' platform using UTF-8 for both the
    filesystem and stdout; attributes follow the layout printed by the
    generator script at the top of this module.
    """
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    # Byte strings: UTF-8 encoded argument and directory entries.
    argv = 'lumi\xc3\xa8re'
    platform = 'linux2'
    filesystemencoding = 'UTF-8'
    stdoutencoding = 'UTF-8'
    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
|
||||
|
||||
|
||||
class UbuntuKarmicLatin1(StringUtils, unittest.TestCase):
    """
    Fixture values for a 'linux2' platform using ISO-8859-1 for both the
    filesystem and stdout.
    """
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    # Byte strings: ISO-8859-1 encoded argument and directory entries.
    argv = 'lumi\xe8re'
    platform = 'linux2'
    filesystemencoding = 'ISO-8859-1'
    stdoutencoding = 'ISO-8859-1'
    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
|
||||
|
||||
class WindowsXP(StringUtils, unittest.TestCase):
    """
    Fixture values for a 'win32' platform with an 'mbcs' filesystem encoding
    and a 'cp850' stdout encoding.
    """
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    argv = 'lumi\xe8re'
    platform = 'win32'
    filesystemencoding = 'mbcs'
    stdoutencoding = 'cp850'
    # Directory entries are unicode objects on this platform.
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']

    # Presumably read by Twisted Trial to mark these tests as expected
    # failures -- confirm.
    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
|
||||
|
||||
class WindowsXP_UTF8(StringUtils, unittest.TestCase):
    """
    Fixture values for a 'win32' platform with an 'mbcs' filesystem encoding
    and a 'cp65001' stdout encoding.
    """
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    argv = 'lumi\xe8re'
    platform = 'win32'
    filesystemencoding = 'mbcs'
    stdoutencoding = 'cp65001'
    # Directory entries are unicode objects on this platform.
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']

    # Presumably read by Twisted Trial to mark these tests as expected
    # failures -- confirm.
    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
|
||||
|
||||
class WindowsVista(StringUtils, unittest.TestCase):
    """
    Fixture values for a 'win32' platform (Windows Vista uname) with an
    'mbcs' filesystem encoding and a 'cp850' stdout encoding.
    """
    uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel'
    argv = 'lumi\xe8re'
    platform = 'win32'
    filesystemencoding = 'mbcs'
    stdoutencoding = 'cp850'
    # Directory entries are unicode objects on this platform.
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']

    # Presumably read by Twisted Trial to mark these tests as expected
    # failures -- confirm.
    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
|
||||
|
||||
class MacOSXLeopard(StringUtils, unittest.TestCase):
    """
    Fixture values for a 'darwin' platform using UTF-8 for both the
    filesystem and stdout.
    """
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    argv = 'lumi\xc3\xa8re'
    platform = 'darwin'
    filesystemencoding = 'utf-8'
    stdoutencoding = 'UTF-8'
    # The first entry is in decomposed form ('A' followed by U+0308), so the
    # listdir test only passes if listdir_unicode() normalizes its result.
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
|
||||
|
||||
class MacOSXLeopard7bit(StringUtils, unittest.TestCase):
    """
    Fixture values for a 'darwin' platform whose stdout encoding is
    'US-ASCII'.
    """
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    # No `argv` attribute is defined here, so the argv-dependent tests of
    # StringUtils raise SkipTest for this class.
    #argv = 'lumiere'
    platform = 'darwin'
    filesystemencoding = 'utf-8'
    stdoutencoding = 'US-ASCII'
    # The first entry is in decomposed form ('A' followed by U+0308).
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
|
128
src/allmydata/util/stringutils.py
Normal file
128
src/allmydata/util/stringutils.py
Normal file
@ -0,0 +1,128 @@
|
||||
"""
|
||||
Functions used to convert inputs from whatever encoding used in the system to
|
||||
unicode and back.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import unicodedata
|
||||
from allmydata.util.assertutil import precondition
|
||||
from twisted.python import usage
|
||||
|
||||
def get_term_encoding():
    """
    Returns expected encoding for writing to the terminal and reading
    arguments from the command-line.

    This is sys.stdout.encoding when it is set. 'ascii' is returned when the
    encoding is None, or when sys.stdout has been replaced by an object
    without an `encoding` attribute.
    """

    # getattr() keeps this working when sys.stdout is a plain file-like
    # replacement; `is None` is the identity test intended for None.
    encoding = getattr(sys.stdout, 'encoding', None)
    if encoding is None:
        return 'ascii'
    return encoding
|
||||
|
||||
def argv_to_unicode(s):
    """
    Convert one command-line argument (a byte string) to a unicode object.

    The argument is decoded with the encoding reported by
    get_term_encoding(). usage.UsageError, carrying an explanatory message,
    is raised when the bytes are not valid in that encoding.
    """
    precondition(isinstance(s, str), s)

    term_encoding = get_term_encoding()
    try:
        decoded = unicode(s, term_encoding)
    except UnicodeDecodeError:
        # Turn the low-level decoding failure into a message for the user.
        raise usage.UsageError("Argument '%s' cannot be decoded as %s." %
                               (s, term_encoding))
    return decoded
|
||||
|
||||
def unicode_to_url(s):
    """
    Return the UTF-8 byte string to use in a URL for the unicode object `s`.

    Only the character encoding is applied here; no other transformation is
    done on the string.
    """
    precondition(isinstance(s, unicode), s)

    # According to RFC 2718, non-ASCII characters in URLs must be UTF-8
    # encoded.
    utf8_bytes = s.encode('utf-8')
    return utf8_bytes
|
||||
|
||||
def unicode_to_stdout(s):
    """
    Return `s` (a unicode object) as bytes suitable for writing to stdout.

    The encoding comes from get_term_encoding(); characters it cannot
    represent are substituted using the codec's 'replace' error handler
    instead of raising.
    """
    precondition(isinstance(s, unicode), s)

    term_encoding = get_term_encoding()
    return s.encode(term_encoding, 'replace')
|
||||
|
||||
def unicode_platform():
    """
    Tell whether the current platform handles Unicode filenames natively.

    Returns True when sys.platform is 'win32' or 'darwin', False otherwise.
    """
    native_unicode_platforms = ('win32', 'darwin')
    return sys.platform in native_unicode_platforms
|
||||
|
||||
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
    # Raised by listdir_unicode_unix() and open_unicode() in this module, with
    # the offending path or filename as the only argument.
    # NOTE(review): nothing visible in this module fills in the '%s'
    # placeholder of the docstring -- presumably it is meant to receive the
    # filesystem encoding; confirm before relying on it.

    pass
|
||||
|
||||
def listdir_unicode_unix(path):
    """
    This function emulates an Unicode API under Unix similar to one available
    under Windows or MacOS X.

    `path` must be a unicode object. It is encoded with the filesystem
    encoding, listed through the byte-level os.listdir(), and every entry is
    decoded back to unicode with the same encoding.

    Raises FilenameEncodingError (carrying the offending name) when `path`
    cannot be encoded, or when a badly encoded filename is encountered.
    """
    precondition(isinstance(path, unicode), path)

    encoding = sys.getfilesystemencoding()
    try:
        byte_path = path.encode(encoding)
    except UnicodeEncodeError:
        raise FilenameEncodingError(path)

    # Decode entry by entry so that the failing name is known explicitly,
    # rather than relying on a list-comprehension variable leaking into the
    # function scope (a Python 2 peculiarity).
    filenames = []
    for fn in os.listdir(byte_path):
        try:
            filenames.append(unicode(fn, encoding))
        except UnicodeDecodeError:
            raise FilenameEncodingError(fn)
    return filenames
|
||||
|
||||
def listdir_unicode(path, encoding=None):
    """
    List the directory `path` (a unicode object) and return its entries
    normalized to Unicode form NFC.

    On platforms where unicode_platform() is true, os.listdir() is called
    with the unicode path directly; elsewhere the listing goes through
    listdir_unicode_unix(). The `encoding` argument is accepted but not used
    by this function.
    """
    precondition(isinstance(path, unicode), path)

    if not unicode_platform():
        # Unix systems: go through the byte-level API and decode.
        entries = listdir_unicode_unix(path)
    else:
        # Windows and MacOS X: the Unicode API is used.
        entries = os.listdir(path)

    # Normalizing prevents different operating systems from producing
    # non-equal unicode strings for the same filename representation.
    normalized = []
    for entry in entries:
        normalized.append(unicodedata.normalize('NFC', entry))
    return normalized
|
||||
|
||||
def open_unicode(path, mode='r'):
    """
    Open the file named by the unicode object `path` with the given `mode`
    and return the result of the builtin open().

    On platforms where unicode_platform() is true the unicode path is passed
    to open() unchanged. Elsewhere it is first encoded with the filesystem
    encoding; FilenameEncodingError is raised when that encoding cannot
    represent the path.
    """
    precondition(isinstance(path, unicode), path)

    # Native Unicode filename support: nothing to convert.
    if unicode_platform():
        return open(path, mode)

    fs_encoding = sys.getfilesystemencoding()
    try:
        return open(path.encode(fs_encoding), mode)
    except UnicodeEncodeError:
        raise FilenameEncodingError(path)
|
Loading…
x
Reference in New Issue
Block a user