tahoe-lafs/src/allmydata/util/base32.py

"""
Base32 encoding.
"""

from builtins import bytes
from past.builtins import chr as byteschr

import base64

from allmydata.util.assertutil import precondition

# The RFC 3548 base-32 alphabet in lower case.  RFC3548 is the standard used
# by Gnutella, Content-Addressable Web, THEX, Bitzi, Web-Calculus...
rfc3548_alphabet = b"abcdefghijklmnopqrstuvwxyz234567"
chars = rfc3548_alphabet

# The 32 quintet values 0..31 as a byte string, in the same order as `chars`.
# Built through bytearray so that the result is bytes on both Python 2 and
# Python 3: joining the results of the builtin chr() yields text on Python 3,
# which cannot be joined into a bytes object.
vals = bytes(bytearray(range(32)))
# Translation tables between alphabet characters and their quintet values.
c2vtranstable = bytes.maketrans(chars, vals)
v2ctranstable = bytes.maketrans(vals, chars)
# Table mapping every byte to itself, for bytes.translate() calls that only
# want to use the "delete" argument.
identitytranstable = bytes.maketrans(b'', b'')

def _get_trailing_chars_without_lsbs(N, d):
    """
    Collect the alphabet characters whose index has its N least significant
    bits clear.

    The function first recurses for N+1 (up to 4), so characters with more
    trailing zero bits come first in the result.

    @param N: the number of least significant bits that are ignored
    @param d: a dict recording which alphabet indexes have already been
        emitted; it is updated in place so that no character is listed twice

    @return: a list of one-byte strings that can legitimately appear in the last place when the least significant N bits are ignored.
    """
    s = []
    if N < 4:
        s.extend(_get_trailing_chars_without_lsbs(N+1, d=d))
    # Stepping by 2**N visits exactly the indexes whose low N bits are zero.
    for i in range(0, len(chars), 2**N):
        # dict.has_key() is gone on Python 3; the "in" operator works on both.
        if i not in d:
            d[i] = None
            # Slice instead of index: indexing bytes gives an int on Python 3,
            # which the caller could not join back into a byte string.
            s.append(chars[i:i+1])
    return s

def get_trailing_chars_without_lsbs(N):
    """
    Return the alphabet characters that may appear in the last place when the
    N least significant bits of the final quintet are ignored.

    @param N: the number of ignored least significant bits; checked with
        precondition() to be 0, 1, 2, 3 or 4

    @return: a byte string of the permitted characters; for N == 0 this is
        the whole alphabet
    """
    # The message now states the range that is actually enforced.
    precondition(0 <= N < 5, "N is required to be >= 0 and < 5.", N=N)
    if N == 0:
        return chars
    # Scratch dict the helper uses to avoid emitting a character twice.
    d = {}
    return b''.join(_get_trailing_chars_without_lsbs(N, d=d))

# Regular-expression character classes (as bytes) for a single base-32
# character.  BASE32CHAR matches any alphabet character; BASE32CHAR_<k>bits
# matches only characters whose lowest 5-k bits are zero, i.e. a final
# character that carries k bits of data.
(
    BASE32CHAR,
    BASE32CHAR_4bits,
    BASE32CHAR_3bits,
    BASE32CHAR_2bits,
    BASE32CHAR_1bits,
) = (
    b'[' + get_trailing_chars_without_lsbs(ignored_lsbs) + b']'
    for ignored_lsbs in range(5)
)

# Patterns for the trailing partial group that encodes 1 to 4 bytes:
#   1 byte  =  8 bits = 1 full character  + 3 bits
#   2 bytes = 16 bits = 3 full characters + 1 bit
#   3 bytes = 24 bits = 4 full characters + 4 bits
#   4 bytes = 32 bits = 6 full characters + 2 bits
BASE32STR_1byte = BASE32CHAR + BASE32CHAR_3bits
BASE32STR_2bytes = BASE32CHAR + b'{3}' + BASE32CHAR_1bits
BASE32STR_3bytes = BASE32CHAR + b'{4}' + BASE32CHAR_4bits
BASE32STR_4bytes = BASE32CHAR + b'{6}' + BASE32CHAR_2bits

# Any number of whole 8-character groups, followed by nothing or by one of
# the partial groups above; the whole match is a single capturing group.
BASE32STR_anybytes = (
    b'((?:%s{8})*' % (BASE32CHAR,)
    + b"(?:|%s|%s|%s|%s))" % (
        BASE32STR_1byte,
        BASE32STR_2bytes,
        BASE32STR_3bytes,
        BASE32STR_4bytes,
    )
)

def b2a(os):
    """
    Encode binary data as lower-case base-32 with the "=" padding removed.

    @param os the data to be encoded (a string)

    @return the contents of os in base-32 encoded form
    """
    padded = base64.b32encode(os)
    # The stdlib emits upper case plus "=" padding; strip both conventions.
    return padded.lower().rstrip(b"=")

def b2a_or_none(os):
    """
    Encode os with b2a(), passing None through unchanged.

    @param os the data to be encoded, or None

    @return the base-32 encoded form of os, or None if os is None
    """
    if os is None:
        return None
    return b2a(os)

# b2a() emits the smallest number of quintets (5-bit characters) that can
# hold the binary input.  Working modulo 40 bits (5 octets == 8 quintets),
# the relation is:
# num_qs = NUM_OS_TO_NUM_QS[num_os]
NUM_OS_TO_NUM_QS = (0, 2, 4, 5, 7)

# The reverse direction: num_os = NUM_QS_TO_NUM_OS[num_qs].  When
# NUM_QS_LEGIT[num_qs] is false, *no* number of octets encodes to that many
# quintets, so the encoded string has either been mangled (truncated) or was
# meant for a2b_l(), a decoder that is told the real length of the data
# (a2b_l() is not defined in the part of the module visible here).

NUM_QS_TO_NUM_OS = (0, 1, 1, 2, 2, 3, 3, 4)
NUM_QS_LEGIT = (1, 0, 1, 0, 1, 1, 0, 1)
# Number of data bits represented by each quintet count: octets times 8.
NUM_QS_TO_NUM_BITS = tuple(num_os * 8 for num_os in NUM_QS_TO_NUM_OS)

# A fast way to determine whether a given string *could* be base-32 encoded data, assuming that the
# original data had 8K bits for a positive integer K.
# The boolean value of s8[len(s)%8][ord(s[-1])], where s is the possibly base-32 encoded string
# tells whether the final character is reasonable.
def add_check_array(cs, sfmap):
    """
    Append to sfmap a 256-entry membership table for the characters in cs.

    The appended tuple holds 1 at the ordinal of every character in cs and 0
    everywhere else.

    @param cs: the permitted characters (a byte string; a text string is
        accepted as well)
    @param sfmap: the list to append the table to; it is modified in place
    """
    checka = [0] * 256
    for c in cs:
        # Iterating bytes yields ints on Python 3 but one-character strings
        # on Python 2 (and for text input); only the latter need ord().
        checka[c if isinstance(c, int) else ord(c)] = 1
    sfmap.append(tuple(checka))

def init_s8():
    """
    Build the tables that say which final characters are acceptable for an
    encoded string, indexed by the string's length modulo 8.

    @return: a tuple of eight 256-entry tuples; entry [len(s) % 8][ordinal]
        is 1 if a string of that length may end with that character
    """
    tables = []
    # Remainder 0: any alphabet character may come last.
    add_check_array(chars, tables)
    for lenmod8 in range(1, 8):
        if NUM_QS_LEGIT[lenmod8]:
            # NOTE(review): NUM_QS_TO_NUM_BITS[lenmod8] % 5 is the number of
            # data bits in the final quintet, so 5 minus that would be the
            # number of padding bits.  This passes 4 minus that, which makes
            # the table accept characters with one fewer zero low bit than
            # BASE32STR_1byte..BASE32STR_4bytes do.  Left unchanged because
            # tightening it would reject input that is accepted today --
            # confirm the intent.
            ignored_lsbs = 4 - (NUM_QS_TO_NUM_BITS[lenmod8] % 5)
            add_check_array(get_trailing_chars_without_lsbs(ignored_lsbs), tables)
        else:
            # No encoding has this length, so no final character is valid.
            # An empty *byte* string keeps this consistent with the other
            # calls, which all pass bytes.
            add_check_array(b'', tables)
    return tuple(tables)
s8 = init_s8()

def could_be_base32_encoded(s, s8=s8, tr=bytes.translate, identitytranstable=identitytranstable, chars=chars):
    """
    Cheaply test whether s could be base-32 encoded data.

    Two things are checked: the final character is one that s8 permits for a
    string of this length modulo 8, and every character of s belongs to the
    base-32 alphabet.  The empty string is accepted.

    @param s: the candidate encoding; must be bytes (enforced with
        precondition())
    @param s8, tr, identitytranstable, chars: module-level objects bound as
        default arguments (presumably for faster lookup); callers are not
        expected to pass them

    @return: True if s passes both checks, otherwise False
    """
    precondition(isinstance(s, bytes), s)
    if s == b'':
        return True
    # Slice (s[-1:]) rather than index (s[-1]): indexing bytes gives an int
    # on Python 3 and ord() rejects ints, while ord() of a length-1 byte
    # string works on both Python 2 and Python 3.
    last_is_ok = s8[len(s)%8][ord(s[-1:])]
    # Deleting every alphabet character must leave nothing behind.
    return bool(last_is_ok and not tr(s, identitytranstable, chars))

def a2b(cs):
    """
    Decode unpadded base-32 data, as produced by b2a(), back into bytes.

    @param cs the base-32 encoded data (a string)

    @return the decoded binary data
    """
    precondition(could_be_base32_encoded(cs), "cs is required to be possibly base32 encoded data.", cs=cs)
    precondition(isinstance(cs, bytes), cs)

    # Python's base64 module wants upper case and "=" padding up to a
    # multiple of 8 characters, so restore both before decoding.
    missing = -len(cs) % 8
    return base64.b32decode(cs.upper() + b"=" * missing)


# Names exported by "from allmydata.util.base32 import *".
__all__ = [
    "b2a",
    "a2b",
    "b2a_or_none",
    "BASE32CHAR_3bits",
    "BASE32CHAR_1bits",
    "BASE32CHAR",
    "BASE32STR_anybytes",
    "could_be_base32_encoded",
]
Rip out a whole bunch of code, either deleting or replacing with stdlib's base32 implementation. 2020-07-14 11:45:05 -04:00			`"""`
			`Base32 encoding.`
			`"""`

Initial, manual steps of Python 3 port. 2020-07-14 11:53:40 -04:00			`from builtins import bytes`
			`from past.builtins import chr as byteschr`
Rip out a whole bunch of code, either deleting or replacing with stdlib's base32 implementation. 2020-07-14 11:45:05 -04:00
Initial, manual steps of Python 3 port. 2020-07-14 11:53:40 -04:00			`import base64`
switch from base62 to base32 for storage indices, switch from z-base-32 to rfc 3548 base-32 for everything, separate out base32 encoding from idlib 2008-02-14 19:27:47 -07:00
Change relative imports to absolute 2010-02-26 01:14:33 -07:00			`from allmydata.util.assertutil import precondition`
switch from base62 to base32 for storage indices, switch from z-base-32 to rfc 3548 base-32 for everything, separate out base32 encoding from idlib 2008-02-14 19:27:47 -07:00
Initial, manual steps of Python 3 port. 2020-07-14 11:53:40 -04:00			`rfc3548_alphabet = b"abcdefghijklmnopqrstuvwxyz234567" # RFC3548 standard used by Gnutella, Content-Addressable Web, THEX, Bitzi, Web-Calculus...`
switch from base62 to base32 for storage indices, switch from z-base-32 to rfc 3548 base-32 for everything, separate out base32 encoding from idlib 2008-02-14 19:27:47 -07:00			`chars = rfc3548_alphabet`

Initial, manual steps of Python 3 port. 2020-07-14 11:53:40 -04:00			`vals = b''.join(map(chr, range(32)))`
			`c2vtranstable = bytes.maketrans(chars, vals)`
			`v2ctranstable = bytes.maketrans(vals, chars)`
			`identitytranstable = bytes.maketrans(b'', b'')`
switch from base62 to base32 for storage indices, switch from z-base-32 to rfc 3548 base-32 for everything, separate out base32 encoding from idlib 2008-02-14 19:27:47 -07:00
			`def _get_trailing_chars_without_lsbs(N, d):`
			`"""`
			`@return: a list of chars that can legitimately appear in the last place when the least significant N bits are ignored.`
			`"""`
			`s = []`
			`if N < 4:`
			`s.extend(_get_trailing_chars_without_lsbs(N+1, d=d))`
			`i = 0`
			`while i < len(chars):`
			`if not d.has_key(i):`
			`d[i] = None`
			`s.append(chars[i])`
			`i = i + 2**N`
			`return s`

			`def get_trailing_chars_without_lsbs(N):`
			`precondition((N >= 0) and (N < 5), "N is required to be > 0 and < len(chars).", N=N)`
			`if N == 0:`
			`return chars`
			`d = {}`
Initial, manual steps of Python 3 port. 2020-07-14 11:53:40 -04:00			`return b''.join(_get_trailing_chars_without_lsbs(N, d=d))`
switch from base62 to base32 for storage indices, switch from z-base-32 to rfc 3548 base-32 for everything, separate out base32 encoding from idlib 2008-02-14 19:27:47 -07:00
Initial, manual steps of Python 3 port. 2020-07-14 11:53:40 -04:00			`BASE32CHAR =b'['+get_trailing_chars_without_lsbs(0)+b']'`
			`BASE32CHAR_4bits =b'['+get_trailing_chars_without_lsbs(1)+b']'`
			`BASE32CHAR_3bits =b'['+get_trailing_chars_without_lsbs(2)+b']'`
			`BASE32CHAR_2bits =b'['+get_trailing_chars_without_lsbs(3)+b']'`
			`BASE32CHAR_1bits =b'['+get_trailing_chars_without_lsbs(4)+b']'`
switch from base62 to base32 for storage indices, switch from z-base-32 to rfc 3548 base-32 for everything, separate out base32 encoding from idlib 2008-02-14 19:27:47 -07:00			`BASE32STR_1byte = BASE32CHAR+BASE32CHAR_3bits`
Initial, manual steps of Python 3 port. 2020-07-14 11:53:40 -04:00			`BASE32STR_2bytes = BASE32CHAR+b'{3}'+BASE32CHAR_1bits`
			`BASE32STR_3bytes = BASE32CHAR+b'{4}'+BASE32CHAR_4bits`
			`BASE32STR_4bytes = BASE32CHAR+b'{6}'+BASE32CHAR_2bits`
			`BASE32STR_anybytes =b'((?:%s{8})*' % (BASE32CHAR,) + b"(?:|%s|%s|%s|%s))" % (BASE32STR_1byte, BASE32STR_2bytes, BASE32STR_3bytes, BASE32STR_4bytes)`
switch from base62 to base32 for storage indices, switch from z-base-32 to rfc 3548 base-32 for everything, separate out base32 encoding from idlib 2008-02-14 19:27:47 -07:00
			`def b2a(os):`
			`"""`
			`@param os the data to be encoded (a string)`

			`@return the contents of os in base-32 encoded form`
			`"""`
Rip out a whole bunch of code, either deleting or replacing with stdlib's base32 implementation. 2020-07-14 11:45:05 -04:00			`return base64.b32encode(os).rstrip(b"=").lower()`
switch from base62 to base32 for storage indices, switch from z-base-32 to rfc 3548 base-32 for everything, separate out base32 encoding from idlib 2008-02-14 19:27:47 -07:00
			`def b2a_or_none(os):`
			`if os is not None:`
			`return b2a(os)`

			`# b2a() uses the minimal number of quintets sufficient to encode the binary`
			`# input. It just so happens that the relation is like this (everything is`
			`# modulo 40 bits).`
			`# num_qs = NUM_OS_TO_NUM_QS[num_os]`
			`NUM_OS_TO_NUM_QS=(0, 2, 4, 5, 7,)`

			`# num_os = NUM_QS_TO_NUM_OS[num_qs], but if not NUM_QS_LEGIT[num_qs] then`
			`# there is no number of octets which would have resulted in this number of`
			`# quintets, so either the encoded string has been mangled (truncated) or else`
			`# you were supposed to decode it with a2b_l() (which means you were supposed`
			`# to know the actual length of the encoded data).`

			`NUM_QS_TO_NUM_OS=(0, 1, 1, 2, 2, 3, 3, 4)`
			`NUM_QS_LEGIT=(1, 0, 1, 0, 1, 1, 0, 1,)`
			`NUM_QS_TO_NUM_BITS=tuple(map(lambda x: x*8, NUM_QS_TO_NUM_OS))`

			`# A fast way to determine whether a given string could be base-32 encoded data, assuming that the`
			`# original data had 8K bits for a positive integer K.`
			`# The boolean value of s8[len(s)%8][ord(s[-1])], where s is the possibly base-32 encoded string`
			`# tells whether the final character is reasonable.`
			`def add_check_array(cs, sfmap):`
			`checka=[0] * 256`
			`for c in cs:`
			`checka[ord(c)] = 1`
			`sfmap.append(tuple(checka))`

			`def init_s8():`
			`s8 = []`
			`add_check_array(chars, s8)`
			`for lenmod8 in (1, 2, 3, 4, 5, 6, 7,):`
			`if NUM_QS_LEGIT[lenmod8]:`
			`add_check_array(get_trailing_chars_without_lsbs(4-(NUM_QS_TO_NUM_BITS[lenmod8]%5)), s8)`
			`else:`
			`add_check_array('', s8)`
			`return tuple(s8)`
			`s8 = init_s8()`

Initial, manual steps of Python 3 port. 2020-07-14 11:53:40 -04:00			`def could_be_base32_encoded(s, s8=s8, tr=bytes.translate, identitytranstable=identitytranstable, chars=chars):`
			`precondition(isinstance(s, bytes), s)`
			`if s == b'':`
switch from base62 to base32 for storage indices, switch from z-base-32 to rfc 3548 base-32 for everything, separate out base32 encoding from idlib 2008-02-14 19:27:47 -07:00			`return True`
			`return s8[len(s)%8][ord(s[-1])] and not tr(s, identitytranstable, chars)`

			`def a2b(cs):`
			`"""`
			`@param cs the base-32 encoded data (a string)`
			`"""`
			`precondition(could_be_base32_encoded(cs), "cs is required to be possibly base32 encoded data.", cs=cs)`
Initial, manual steps of Python 3 port. 2020-07-14 11:53:40 -04:00			`precondition(isinstance(cs, bytes), cs)`
switch from base62 to base32 for storage indices, switch from z-base-32 to rfc 3548 base-32 for everything, separate out base32 encoding from idlib 2008-02-14 19:27:47 -07:00
Rip out a whole bunch of code, either deleting or replacing with stdlib's base32 implementation. 2020-07-14 11:45:05 -04:00			`cs = cs.upper()`
			`# Add padding back, to make Python's base64 module happy:`
			`while (len(cs) * 5) % 8 != 0:`
			`cs += b"="`
			`return base64.b32decode(cs)`
Document current public API. 2020-07-13 15:41:01 -04:00

Remove a2b_l and b2a_l from public API, to ease likely future switch to stdlib base64's implementation. 2020-07-13 16:12:53 -04:00			`__all__ = ["b2a", "a2b", "b2a_or_none", "BASE32CHAR_3bits", "BASE32CHAR_1bits", "BASE32CHAR", "BASE32STR_anybytes", "could_be_base32_encoded"]`