tahoe-lafs/src/allmydata/util/base62.py

"""
Base62 encoding.

Ported to Python 3.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from future.utils import PY2
if PY2:
    from builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, str, max, min  # noqa: F401

if PY2:
    import string
    maketrans = string.maketrans
    translate = string.translate
else:
    maketrans = bytes.maketrans
    translate = bytes.translate

from past.builtins import chr as byteschr

from allmydata.util.mathutil import log_ceil, log_floor

chars = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

BASE62CHAR = b'[' + chars + b']'

vals = b''.join([byteschr(i) for i in range(62)])
c2vtranstable = maketrans(chars, vals)
v2ctranstable = maketrans(vals, chars)
identitytranstable = maketrans(chars, chars)

def b2a(os):
    """
    @param os the data to be encoded (as bytes)

    @return the contents of os in base-62 encoded form, as bytes
    """
    cs = b2a_l(os, len(os)*8)
    assert num_octets_that_encode_to_this_many_chars(len(cs)) == len(os), "%s != %s, numchars: %s" % (num_octets_that_encode_to_this_many_chars(len(cs)), len(os), len(cs))
    return cs

def b2a_l(os, lengthinbits):
    """
    @param os the data to be encoded (as bytes)
    @param lengthinbits the number of bits of data in os to be encoded

    b2a_l() will generate a base-62 encoded string big enough to encode
    lengthinbits bits.  So for example if os is 3 bytes long and lengthinbits is
    17, then b2a_l() will generate a 3-character- long base-62 encoded string
    (since 3 chars is sufficient to encode more than 2^17 values).  If os is 3
    bytes long and lengthinbits is 18 (or None), then b2a_l() will generate a
    4-character string (since 4 chars are required to hold 2^18 values).  Note
    that if os is 3 bytes long and lengthinbits is 17, the least significant 7
    bits of os are ignored.

    Warning: if you generate a base-62 encoded string with b2a_l(), and then someone else tries to
    decode it by calling a2b() instead of  a2b_l(), then they will (potentially) get a different
    string than the one you encoded!  So use b2a_l() only when you are sure that the encoding and
    decoding sides know exactly which lengthinbits to use.  If you do not have a way for the
    encoder and the decoder to agree upon the lengthinbits, then it is best to use b2a() and
    a2b().  The only drawback to using b2a() over b2a_l() is that when you have a number of
    bits to encode that is not a multiple of 8, b2a() can sometimes generate a base-62 encoded
    string that is one or two characters longer than necessary.

    @return the contents of os in base-62 encoded form, as bytes
    """
    # We call bytes() again for Python 2, to ensure literals are using future's
    # Python 3-compatible variant.
    os = [o for o in reversed(bytes(os))] # treat os as big-endian -- and we want to process the least-significant o first

    value = 0
    numvalues = 1 # the number of possible values that value could be
    for o in os:
        o *= numvalues
        value += o
        numvalues *= 256

    chars = []
    while numvalues > 0:
        chars.append(value % 62)
        value //= 62
        numvalues //= 62

    return translate(bytes([c for c in reversed(chars)]), v2ctranstable) # make it big-endian

def num_octets_that_encode_to_this_many_chars(numcs):
    return log_floor(62**numcs, 256)

def num_chars_that_this_many_octets_encode_to(numos):
    return log_ceil(256**numos, 62)

def a2b(cs):
    """
    @param cs the base-62 encoded data (a string)
    """
    return a2b_l(cs, num_octets_that_encode_to_this_many_chars(len(cs))*8)

def a2b_l(cs, lengthinbits):
    """
    @param lengthinbits the number of bits of data in encoded into cs

    a2b_l() will return a result just big enough to hold lengthinbits bits.  So
    for example if cs is 2 characters long (encoding between 5 and 12 bits worth
    of data) and lengthinbits is 8, then a2b_l() will return a string of length
    1 (since 1 byte is sufficient to store 8 bits), but if lengthinbits is 9,
    then a2b_l() will return a string of length 2.

    Please see the warning in the docstring of b2a_l() regarding the use of
    b2a() versus b2a_l().

    @return the data encoded in cs, as bytes
    """
    # We call bytes() again for Python 2, to ensure literals are using future's
    # Python 3-compatible variant.
    cs = [c for c in reversed(bytes(translate(cs, c2vtranstable)))] # treat cs as big-endian -- and we want to process the least-significant c first

    value = 0
    numvalues = 1 # the number of possible values that value could be
    for c in cs:
        c *= numvalues
        value += c
        numvalues *= 62

    numvalues = 2**lengthinbits
    result_bytes = []
    while numvalues > 1:
        result_bytes.append(value % 256)
        value //= 256
        numvalues //= 256

    return bytes([b for b in reversed(result_bytes)]) # make it big-endian