Generate and consume the new structure properly

2025-04-05 09:59:24 +00:00 · 2023-01-03 19:22:38 -05:00 · 2023-01-03 19:22:38 -05:00 · fb70ba1867
commit fb70ba1867
parent ca00adf2b4
3 changed files with 203 additions and 2068 deletions
--- a/integration/test_vectors.py
+++ b/integration/test_vectors.py
@ -4,28 +4,30 @@ Verify certain results against test vectors with well-known results.

 from __future__ import annotations

-from time import sleep
 from typing import AsyncGenerator, Iterator
 from hashlib import sha256
-from itertools import product
+from itertools import starmap, product
 from yaml import safe_dump

-from attrs import frozen
-
 from pytest import mark
 from pytest_twisted import ensureDeferred

 from . import vectors
-from .util import reconfigure, upload, asyncfoldr, insert, TahoeProcess
+from .util import reconfigure, upload, TahoeProcess

 def digest(bs: bytes) -> bytes:
+    """
+    Digest bytes to bytes.
+    """
    return sha256(bs).digest()


 def hexdigest(bs: bytes) -> str:
+    """
+    Digest bytes to text.
+    """
    return sha256(bs).hexdigest()

-
 # Just a couple convergence secrets.  The only thing we do with this value is
 # feed it into a tagged hash.  It certainly makes a difference to the output
 # but the hash should destroy any structure in the input so it doesn't seem
@ -35,7 +37,6 @@ CONVERGENCE_SECRETS = [
    digest(b"Hello world")[:16],
 ]

-
 # Exercise at least a handful of different sizes, trying to cover:
 #
 #  1. Some cases smaller than one "segment" (128k).
@ -51,87 +52,66 @@ CONVERGENCE_SECRETS = [

 SEGMENT_SIZE = 128 * 1024
 OBJECT_DESCRIPTIONS = [
-    (b"a", 1024),
-    (b"c", 4096),
-    (digest(b"foo"), SEGMENT_SIZE - 1),
-    (digest(b"bar"), SEGMENT_SIZE + 1),
-    (digest(b"baz"), SEGMENT_SIZE * 16 - 1),
-    (digest(b"quux"), SEGMENT_SIZE * 16 + 1),
-    (digest(b"foobar"), SEGMENT_SIZE * 64 - 1),
-    (digest(b"barbaz"), SEGMENT_SIZE * 64 + 1),
+    vectors.Sample(b"a", 1024),
+    vectors.Sample(b"c", 4096),
+    vectors.Sample(digest(b"foo"), SEGMENT_SIZE - 1),
+    vectors.Sample(digest(b"bar"), SEGMENT_SIZE + 1),
+    vectors.Sample(digest(b"baz"), SEGMENT_SIZE * 16 - 1),
+    vectors.Sample(digest(b"quux"), SEGMENT_SIZE * 16 + 1),
+    vectors.Sample(digest(b"foobar"), SEGMENT_SIZE * 64 - 1),
+    vectors.Sample(digest(b"barbaz"), SEGMENT_SIZE * 64 + 1),
 ]

-# CHK have a max of 256 shares.  SDMF / MDMF have a max of 255 shares!
-# Represent max symbolically and resolve it when we know what format we're
-# dealing with.
-MAX_SHARES = "max"
-
-# SDMF and MDMF encode share counts (N and k) into the share itself as an
-# unsigned byte.  They could have encoded (share count - 1) to fit the full
-# range supported by ZFEC into the unsigned byte - but they don't.  So 256 is
-# inaccessible to those formats and we set the upper bound at 255.
-MAX_SHARES_MAP = {
-    "chk": 256,
-    "sdmf": 255,
-    "mdmf": 255,
-}
-
 ZFEC_PARAMS = [
-    (1, 1),
-    (1, 3),
-    (2, 3),
-    (3, 10),
-    (71, 255),
-    (101, MAX_SHARES),
+    vectors.SeedParam(1, 1),
+    vectors.SeedParam(1, 3),
+    vectors.SeedParam(2, 3),
+    vectors.SeedParam(3, 10),
+    vectors.SeedParam(71, 255),
+    vectors.SeedParam(101, vectors.MAX_SHARES),
 ]

 FORMATS = [
    "chk",
-    "sdmf",
-    "mdmf",
+    # "sdmf",
+    # "mdmf",
 ]

-@mark.parametrize('convergence_idx', range(len(CONVERGENCE_SECRETS)))
-def test_convergence(convergence_idx):
+@mark.parametrize('convergence', CONVERGENCE_SECRETS)
+def test_convergence(convergence):
    """
    Convergence secrets are 16 bytes.
    """
-    convergence = CONVERGENCE_SECRETS[convergence_idx]
    assert isinstance(convergence, bytes), "Convergence secret must be bytes"
    assert len(convergence) == 16, "Convergence secret must by 16 bytes"


-@mark.parametrize('params_idx', range(len(ZFEC_PARAMS)))
-@mark.parametrize('convergence_idx', range(len(CONVERGENCE_SECRETS)))
-@mark.parametrize('data_idx', range(len(OBJECT_DESCRIPTIONS)))
-@mark.parametrize('fmt_idx', range(len(FORMATS)))
+@mark.parametrize('seed_params', ZFEC_PARAMS)
+@mark.parametrize('convergence', CONVERGENCE_SECRETS)
+@mark.parametrize('seed_data', OBJECT_DESCRIPTIONS)
+@mark.parametrize('fmt', FORMATS)
@ensureDeferred
-async def test_capability(reactor, request, alice, params_idx, convergence_idx, data_idx, fmt_idx):
+async def test_capability(reactor, request, alice, seed_params, convergence, seed_data, fmt):
    """
    The capability that results from uploading certain well-known data
    with certain well-known parameters results in exactly the previously
    computed value.
    """
-    case = load_case(
-        params_idx,
-        convergence_idx,
-        data_idx,
-        fmt_idx,
-    )
+    case = vectors.Case(seed_params, convergence, seed_data, fmt)

    # rewrite alice's config to match params and convergence
-    await reconfigure(reactor, request, alice, (1,) + case.params, case.convergence)
+    await reconfigure(reactor, request, alice, (1, case.params.required, case.params.total), case.convergence)

    # upload data in the correct format
    actual = upload(alice, case.fmt, case.data)

    # compare the resulting cap to the expected result
-    expected = vectors.capabilities["vector"][case.key]
+    expected = vectors.capabilities[case]
    assert actual == expected


@ensureDeferred
-async def skiptest_generate(reactor, request, alice):
+async def test_generate(reactor, request, alice):
    """
    This is a helper for generating the test vectors.

@ -141,27 +121,34 @@ async def skiptest_generate(reactor, request, alice):
    to run against the results produced originally, not a possibly
    ever-changing set of outputs.
    """
-    space = product(
-        range(len(ZFEC_PARAMS)),
-        range(len(CONVERGENCE_SECRETS)),
-        range(len(OBJECT_DESCRIPTIONS)),
-        range(len(FORMATS)),
-    )
-    results = await asyncfoldr(
-        generate(reactor, request, alice, space),
-        insert,
-        {},
-    )
+    space = starmap(vectors.Case, product(
+        ZFEC_PARAMS,
+        CONVERGENCE_SECRETS,
+        OBJECT_DESCRIPTIONS,
+        FORMATS,
+    ))
+    results = generate(reactor, request, alice, space)
    with vectors.DATA_PATH.open("w") as f:
        f.write(safe_dump({
-            "version": "2022-12-26",
-            "params": {
-                "zfec": ZFEC_PARAMS,
-                "convergence": CONVERGENCE_SECRETS,
-                "objects": OBJECT_DESCRIPTIONS,
-                "formats": FORMATS,
-            },
-            "vector": results,
+            "version": "2023-01-03",
+            "vector": [
+                {
+                    "convergence": vectors.encode_bytes(case.convergence),
+                    "format": case.fmt,
+                    "sample": {
+                        "seed": vectors.encode_bytes(case.seed_data.seed),
+                        "length": case.seed_data.length,
+                    },
+                    "zfec": {
+                        "segmentSize": SEGMENT_SIZE,
+                        "required": case.seed_params.required,
+                        "total": case.seed_params.total,
+                    },
+                    "expected": cap,
+                }
+                async for (case, cap)
+                in results
+            ],
        }))


@ -169,8 +156,8 @@ async def generate(
        reactor,
        request,
        alice: TahoeProcess,
-        space: Iterator[int, int, int, int],
-) -> AsyncGenerator[tuple[str, str], None]:
+        cases: Iterator[vectors.Case],
+) -> AsyncGenerator[[vectors.Case, str], None]:
    """
    Generate all of the test vectors using the given node.

@ -184,79 +171,21 @@ async def generate(

    :param alice: The Tahoe-LAFS node to use to generate the test vectors.

-    :param space: An iterator of coordinates in the test vector space for
-       which to generate values.  The elements of each tuple give indexes into
-       ZFEC_PARAMS, CONVERGENCE_SECRETS, OBJECT_DESCRIPTIONS, and FORMATS.
+    :param cases: The inputs for which to generate values.

-    :return: The yield values are two-tuples describing a test vector.  The
-        first element is a string describing a case and the second element is
-        the capability for that case.
+    :return: The yield values are two-tuples of a case and the capability for that case.
    """
    # Share placement doesn't affect the resulting capability.  For maximum
    # reliability of this generator, be happy if we can put shares anywhere
    happy = 1
-    node_key = (None, None)
-    for params_idx, secret_idx, data_idx, fmt_idx in space:
-        case = load_case(params_idx, secret_idx, data_idx, fmt_idx)
-        if node_key != (case.params, case.convergence):
-            await reconfigure(reactor, request, alice, (happy,) + case.params, case.convergence)
-            node_key = (case.params, case.convergence)
+    for case in cases:
+        await reconfigure(
+            reactor,
+            request,
+            alice,
+            (happy, case.params.required, case.params.total),
+            case.convergence
+        )

        cap = upload(alice, case.fmt, case.data)
-        yield case.key, cap
-
-
-def key(params: int, secret: int, data: int, fmt: int) -> str:
-    """
-    Construct the key describing the case defined by the given parameters.
-
-    The parameters are indexes into the test data for a certain case.
-
-    :return: A distinct string for the given inputs.
-    """
-    return f"{params}-{secret}-{data}-{fmt}"
-
-
-def stretch(seed: bytes, size: int) -> bytes:
-    """
-    Given a simple description of a byte string, return the byte string
-    itself.
-    """
-    assert isinstance(seed, bytes)
-    assert isinstance(size, int)
-    assert size > 0
-    assert len(seed) > 0
-
-    multiples = size // len(seed) + 1
-    return (seed * multiples)[:size]
-
-
-def load_case(
-        params_idx: int,
-        convergence_idx: int,
-        data_idx: int,
-        fmt_idx: int
-) -> Case:
-    """
-    :return:
-    """
-    params = ZFEC_PARAMS[params_idx]
-    fmt = FORMATS[fmt_idx]
-    convergence = CONVERGENCE_SECRETS[convergence_idx]
-    data = stretch(*OBJECT_DESCRIPTIONS[data_idx])
-    if params[1] == MAX_SHARES:
-        params = (params[0], MAX_SHARES_MAP[fmt])
-    k = key(params_idx, convergence_idx, data_idx, fmt_idx)
-    return Case(k, fmt, params, convergence, data)
-
-
-@frozen
-class Case:
-    """
-    Represent one case for which we want/have a test vector.
-    """
-    key: str
-    fmt: str
-    params: tuple[int, int]
-    convergence: bytes
-    data: bytes
+        yield case, cap
--- a/integration/test_vectors.yaml
+++ b/integration/test_vectors.yaml
--- a/integration/vectors.py
+++ b/integration/vectors.py
@ -1,18 +1,144 @@
 """
 A module that loads pre-generated test vectors.

-:ivar CHK_PATH: The path of the file containing CHK test vectors.
+:ivar DATA_PATH: The path of the file containing test vectors.

-:ivar chk: The CHK test vectors.
+:ivar capabilities: A mapping from each ``Case`` to its expected capability.
 """

+from __future__ import annotations
+
+from typing import TextIO
+from attrs import frozen
 from yaml import safe_load
 from pathlib import Path
+from base64 import b64encode, b64decode

 DATA_PATH: Path = Path(__file__).parent / "test_vectors.yaml"

+@frozen
+class Sample:
+    """
+    Some instructions for building a long byte string.
+
+    :ivar seed: Some bytes to repeat (and then truncate) to produce the string.
+    :ivar length: The length of the desired byte string.
+    """
+    seed: bytes
+    length: int
+
+@frozen
+class Param:
+    """
+    Some ZFEC parameters.
+    """
+    required: int
+    total: int
+
+# CHK have a max of 256 shares.  SDMF / MDMF have a max of 255 shares!
+# Represent max symbolically and resolve it when we know what format we're
+# dealing with.
+MAX_SHARES = "max"
+
+# SDMF and MDMF encode share counts (N and k) into the share itself as an
+# unsigned byte.  They could have encoded (share count - 1) to fit the full
+# range supported by ZFEC into the unsigned byte - but they don't.  So 256 is
+# inaccessible to those formats and we set the upper bound at 255.
+MAX_SHARES_MAP = {
+    "chk": 256,
+    "sdmf": 255,
+    "mdmf": 255,
+}
+
+@frozen
+class SeedParam:
+    """
+    Some ZFEC parameters, almost.
+
+    :ivar required: The number of required shares.
+
+    :ivar total: Either the number of total shares or the constant
+        ``MAX_SHARES`` to indicate that the total number of shares should be
+        the maximum number supported by the object format.
+    """
+    required: int
+    total: int | str
+
+    def realize(self, max_total: int) -> Param:
+        """
+        Create a ``Param`` from this object's values, possibly
+        substituting the given real value for total if necessary.
+
+        :param max_total: The value to use to replace ``MAX_SHARES`` if
+            necessary.
+        """
+        if self.total == MAX_SHARES:
+            return Param(self.required, max_total)
+        return Param(self.required, self.total)
+
+@frozen
+class Case:
+    """
+    Represent one case for which we want/have a test vector.
+    """
+    seed_params: Param
+    convergence: bytes
+    seed_data: Sample
+    fmt: str
+
+    @property
+    def data(self):
+        return stretch(self.seed_data.seed, self.seed_data.length)
+
+    @property
+    def params(self):
+        return self.seed_params.realize(MAX_SHARES_MAP[self.fmt])
+
+
+def encode_bytes(b: bytes) -> str:
+    """
+    Base64 encode some bytes to text so they are representable in JSON.
+    """
+    return b64encode(b).decode("ascii")
+
+
+def decode_bytes(b: str) -> bytes:
+    """
+    Base64 decode some text to bytes.
+    """
+    return b64decode(b.encode("ascii"))
+
+
+def stretch(seed: bytes, size: int) -> bytes:
+    """
+    Given a simple description of a byte string, return the byte string
+    itself.
+    """
+    assert isinstance(seed, bytes)
+    assert isinstance(size, int)
+    assert size > 0
+    assert len(seed) > 0
+
+    multiples = size // len(seed) + 1
+    return (seed * multiples)[:size]
+
+
+def load_capabilities(f: TextIO) -> dict[Case, str]:
+    data = safe_load(f)
+    return {
+        Case(
+            seed_params=SeedParam(case["zfec"]["required"], case["zfec"]["total"]),
+            convergence=decode_bytes(case["convergence"]),
+            seed_data=Sample(decode_bytes(case["sample"]["seed"]), case["sample"]["length"]),
+            fmt=case["format"],
+        ): case["expected"]
+        for case
+        in data["vector"]
+    }
+
+
 try:
    with DATA_PATH.open() as f:
-        capabilities: dict[str, str] = safe_load(f)
+        capabilities: dict[Case, str] = load_capabilities(f)
 except FileNotFoundError:
    capabilities = {}