ZeroTierOne/zeroidc/vendor/memchr/scripts/make-byte-frequency-table

75 lines
2.0 KiB
Python
Executable File

#!/usr/bin/env python
# This does simple normalized frequency analysis on UTF-8 encoded text. The
# result of the analysis is translated to a ranked list, where every byte is
# assigned a rank. This list is printed to stdout as Rust source, intended to
# be saved as src/freqs.rs.
#
# Currently, the frequencies are generated from the following corpuses:
#
# * The CIA world fact book
# * The source code of rustc
# * Septuaginta
from __future__ import absolute_import, division, print_function
import argparse
from collections import Counter
import sys
# Header comment printed ahead of the generated Rust table.
# NOTE(review): the script name embedded in this text may not match this
# file's actual name -- confirm before regenerating checked-in output.
preamble = (
    '// NOTE: The following code was generated by "scripts/frequencies.py", do not\n'
    '// edit directly\n'
)
def eprint(*args, **kwargs):
    """Print to standard error.

    Accepts the same positional and keyword arguments as ``print``. Any
    ``file`` keyword supplied by the caller is overridden with
    ``sys.stderr``.
    """
    # Build a merged copy so the stderr destination always wins.
    print(*args, **dict(kwargs, file=sys.stderr))
def main():
    """Rank every byte value by corpus frequency and print a Rust table.

    Each FILE named on the command line is read as raw bytes. For every
    file, each byte occurrence contributes ``1 / len(file)`` to that byte's
    score, so each non-empty file carries equal total weight regardless of
    its size. The scores are turned into ranks in the range 0..255 and
    printed to stdout as a ``BYTE_FREQUENCIES`` array, preceded by the
    module-level ``preamble``. Progress messages are sent to stderr through
    ``eprint``.
    """
    p = argparse.ArgumentParser()
    p.add_argument('corpus', metavar='FILE', nargs='+')
    args = p.parse_args()

    # Get frequency counts of each byte. Every byte value is seeded with
    # zero so that bytes absent from the corpus still receive a rank.
    freqs = Counter()
    for i in range(256):
        freqs[i] = 0

    eprint('reading entire corpus into memory')
    corpus = []
    for fpath in args.corpus:
        # Use a context manager so each file handle is closed promptly
        # instead of being left for the garbage collector.
        with open(fpath, 'rb') as f:
            # A bytearray yields integers when iterated on both Python 2
            # and Python 3, which is what the rank table indexing needs.
            corpus.append(bytearray(f.read()))

    eprint('computing byte frequencies')
    for c in corpus:
        if not c:
            # An empty file contributes nothing; skipping it also avoids
            # dividing by zero when computing the per-byte weight below.
            continue
        weight = 1.0 / float(len(c))
        for byte in c:
            freqs[byte] += weight

    eprint('writing Rust code')
    # Get the rank of each byte. A lower rank => lower relative frequency.
    # most_common() lists bytes from most to least frequent, so the most
    # frequent byte receives rank 255.
    rank = [0] * 256
    for i, (byte, _) in enumerate(freqs.most_common()):
        rank[byte] = 255 - i

    # Forcefully set the highest rank possible for bytes that start multi-byte
    # UTF-8 sequences. The idea here is that a continuation byte will be more
    # discerning in a homogeneous haystack.
    for byte in range(0xC0, 0xFF + 1):
        rank[byte] = 255

    # Now write Rust.
    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
    for byte in range(256):
        olines.append(' %3d, // %r' % (rank[byte], chr(byte)))
    olines.append('];')

    print(preamble)
    print('\n'.join(olines))
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()