diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 000000000..57f0be071
--- /dev/null
+++ b/benchmarks/__init__.py
@@ -0,0 +1,8 @@
+"""pytest-based end-to-end benchmarks of Tahoe-LAFS.
+
+Usage:
+
+$ pytest benchmarks --number-of-nodes=3
+
+Pass --number-of-nodes multiple times to benchmark several grid sizes in one run.
+"""
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
new file mode 100644
index 000000000..381bd5670
--- /dev/null
+++ b/benchmarks/conftest.py
@@ -0,0 +1,126 @@
+"""
+pytest infrastructure for benchmarks.
+
+The number of nodes is parameterized via a --number-of-nodes CLI option added
+to pytest.
+"""
+
+from shutil import which, rmtree
+from tempfile import mkdtemp
+from contextlib import contextmanager
+from time import time
+
+import pytest
+import pytest_twisted
+
+from twisted.internet import reactor
+from twisted.internet.defer import DeferredList, succeed
+
+from allmydata.util.iputil import allocate_tcp_port
+
+from integration.grid import Client, create_grid, create_flog_gatherer
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--number-of-nodes",
+        action="append",
+        default=[],
+        type=int,
+        help="number of storage nodes to benchmark against; repeat to cover several grid sizes",
+    )
+    # Required to be compatible with integration.util code that we indirectly
+    # depend on, but also might be useful.
+    parser.addoption(
+        "--force-foolscap",
+        action="store_true",
+        default=False,
+        dest="force_foolscap",
+        help=(
+            "If set, force Foolscap only for the storage protocol. "
+            + "Otherwise HTTP will be used."
+        ),
+    )
+
+
+def pytest_generate_tests(metafunc):
+    # Make number_of_nodes accessible as a parameterized fixture:
+    if "number_of_nodes" in metafunc.fixturenames:
+        metafunc.parametrize(
+            "number_of_nodes",
+            metafunc.config.getoption("number_of_nodes"),
+            scope="session",
+        )
+
+
+def port_allocator():
+    port = allocate_tcp_port()
+    return succeed(port)
+
+
+@pytest.fixture(scope="session")
+def grid(request):
+    """
+    Provides a new Grid with a single Introducer and flog-gathering process.
+
+    Notably does _not_ provide storage servers; use the storage_nodes
+    fixture if your tests need a Grid that can be used for puts / gets.
+    """
+    tmp_path = mkdtemp(prefix="tahoe-benchmark")
+    request.addfinalizer(lambda: rmtree(tmp_path))
+    flog_binary = which("flogtool")
+    flog_gatherer = pytest_twisted.blockon(
+        create_flog_gatherer(reactor, request, tmp_path, flog_binary)
+    )
+    g = pytest_twisted.blockon(
+        create_grid(reactor, request, tmp_path, flog_gatherer, port_allocator)
+    )
+    return g
+
+
+@pytest.fixture(scope="session")
+def storage_nodes(grid, number_of_nodes):
+    nodes_d = []
+    for _ in range(number_of_nodes):
+        nodes_d.append(grid.add_storage_node())
+
+    nodes_status = pytest_twisted.blockon(DeferredList(nodes_d))
+    for ok, value in nodes_status:
+        assert ok, "Storage node creation failed: {}".format(value)
+    return grid.storage_servers
+
+
+@pytest.fixture(scope="session")
+def client_node(request, grid, storage_nodes, number_of_nodes) -> Client:
+    """
+    Create a grid client node with number of shares matching number of nodes.
+    """
+    client_node = pytest_twisted.blockon(
+        grid.add_client(
+            "client_node",
+            needed=number_of_nodes,
+            happy=number_of_nodes,
+            total=number_of_nodes,
+        )
+    )
+    print(f"Client node pid: {client_node.process.transport.pid}")
+    return client_node
+
+
+class Benchmarker:
+    """Keep track of benchmarking results."""
+
+    @contextmanager
+    def record(self, name, **parameters):
+        """Record the timing of running some code, if it succeeds."""
+        start = time()
+        yield
+        elapsed = time() - start
+        # For now we just print the outcome:
+        parameters = " ".join(f"{k}={v}" for (k, v) in parameters.items())
+        print(f"BENCHMARK RESULT: {name} {parameters} elapsed {elapsed} secs")
+
+
+@pytest.fixture(scope="session")
+def tahoe_benchmarker():
+    return Benchmarker()
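
The fixtures above compose so that a benchmark test only has to declare what it needs. A minimal sketch of a consuming test, assuming the session-scoped fixtures behave as defined in this conftest.py (the module name and test body are hypothetical, not part of this change):

    # benchmarks/test_example.py -- hypothetical consumer of these fixtures
    def test_example(client_node, tahoe_benchmarker, number_of_nodes):
        # record() times the body and prints a BENCHMARK RESULT line,
        # but only if the body runs to completion without raising.
        with tahoe_benchmarker.record("example-op", number_of_nodes=number_of_nodes):
            pass  # the operation under measurement goes here

Because pytest_generate_tests() parametrizes number_of_nodes at session scope, passing --number-of-nodes twice runs the whole fixture stack (grid, storage nodes, client) once per requested grid size.
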
+ """ + client_node = pytest_twisted.blockon( + grid.add_client( + "client_node", + needed=number_of_nodes, + happy=number_of_nodes, + total=number_of_nodes, + ) + ) + print(f"Client node pid: {client_node.process.transport.pid}") + return client_node + + +class Benchmarker: + """Keep track of benchmarking results.""" + + @contextmanager + def record(self, name, **parameters): + """Record the timing of running some code, if it succeeds.""" + start = time() + yield + elapsed = time() - start + # For now we just print the outcome: + parameters = " ".join(f"{k}={v}" for (k, v) in parameters.items()) + print(f"BENCHMARK RESULT: {name} {parameters} elapsed {elapsed} secs") + + +@pytest.fixture(scope="session") +def tahoe_benchmarker(): + return Benchmarker() diff --git a/benchmarks/test_cli.py b/benchmarks/test_cli.py new file mode 100644 index 000000000..94eca4475 --- /dev/null +++ b/benchmarks/test_cli.py @@ -0,0 +1,48 @@ +"""Benchmarks for minimal `tahoe` CLI interactions.""" + +from subprocess import Popen, PIPE + +import pytest + +from integration.util import cli + + +@pytest.fixture(scope="session") +def cli_alias(client_node): + cli(client_node.process, "create-alias", "cli") + + +def test_get_put_one_file( + client_node, cli_alias, tmp_path, tahoe_benchmarker, number_of_nodes +): + """ + Upload a file with ``tahoe put`` and then download it with ``tahoe get``, + measuring the latency of both operations. + """ + file_size = 1000 # parameterize later on + file_path = tmp_path / "file" + DATA = b"0123456789" * (file_size // 10) + with file_path.open("wb") as f: + f.write(DATA) + + with tahoe_benchmarker.record( + "cli-put-file", file_size=file_size, number_of_nodes=number_of_nodes + ): + cli(client_node.process, "put", str(file_path), "cli:tostdout") + + with tahoe_benchmarker.record( + "cli-get-file", file_size=file_size, number_of_nodes=number_of_nodes + ): + p = Popen( + [ + "tahoe", + "--node-directory", + client_node.process.node_dir, + "get", + "cli:tostdout", + "-", + ], + stdout=PIPE, + ) + assert p.stdout.read() == DATA + assert p.wait() == 0 diff --git a/benchmarks/upload_download.py b/benchmarks/upload_download.py deleted file mode 100644 index 3dfa63336..000000000 --- a/benchmarks/upload_download.py +++ /dev/null @@ -1,138 +0,0 @@ -""" -First attempt at benchmarking uploads and downloads. - -To run: - -$ pytest benchmarks/upload_download.py -s -v -Wignore - -To add latency of e.g. 60ms on Linux: - -$ tc qdisc add dev lo root netem delay 30ms - -To reset: - -$ tc qdisc del dev lo root netem - -Frequency scaling can spoil the results. -To see the range of frequency scaling on a Linux system: - -$ cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_available_frequencies - -And to pin the CPU frequency to the lower bound found in these files: - -$ sudo cpupower frequency-set -f - -TODO Parameterization (pytest?) - - - Foolscap vs not foolscap - - - Number of nodes - - - Data size - - - Number of needed/happy/total shares. - -CAVEATS: The goal here isn't a realistic benchmark, or a benchmark that will be -measured over time, or is expected to be maintainable over time. This is just -a quick and easy way to measure the speed of certain operations, compare HTTP -and Foolscap, and see the short-term impact of changes. - -Eventually this will be replaced by a real benchmark suite that can be run over -time to measure something more meaningful. 
-""" - -from time import time, process_time -from contextlib import contextmanager -from tempfile import mkdtemp -import os - -from twisted.trial.unittest import TestCase -from twisted.internet.defer import gatherResults - -from allmydata.util.deferredutil import async_to_deferred -from allmydata.util.consumer import MemoryConsumer -from allmydata.test.common_system import SystemTestMixin -from allmydata.immutable.upload import Data as UData -from allmydata.mutable.publish import MutableData - - -@contextmanager -def timeit(name): - start = time() - start_cpu = process_time() - try: - yield - finally: - print( - f"{name}: {time() - start:.3f} elapsed, {process_time() - start_cpu:.3f} CPU" - ) - - -class ImmutableBenchmarks(SystemTestMixin, TestCase): - """Benchmarks for immutables.""" - - # To use Foolscap, change to True: - FORCE_FOOLSCAP_FOR_STORAGE = False - - # Don't reduce HTTP connection timeouts, that messes up the more aggressive - # benchmarks: - REDUCE_HTTP_CLIENT_TIMEOUT = False - - @async_to_deferred - async def setUp(self): - SystemTestMixin.setUp(self) - self.basedir = os.path.join(mkdtemp(), "nodes") - - # 2 nodes - await self.set_up_nodes(2) - - # 1 share - for c in self.clients: - c.encoding_params["k"] = 1 - c.encoding_params["happy"] = 1 - c.encoding_params["n"] = 1 - - print() - - @async_to_deferred - async def test_upload_and_download_immutable(self): - # To test larger files, change this: - DATA = b"Some data to upload\n" * 10 - - for i in range(5): - # 1. Upload: - with timeit(" upload"): - uploader = self.clients[0].getServiceNamed("uploader") - results = await uploader.upload(UData(DATA, convergence=None)) - - # 2. Download: - with timeit("download"): - uri = results.get_uri() - node = self.clients[1].create_node_from_uri(uri) - mc = await node.read(MemoryConsumer(), 0, None) - self.assertEqual(b"".join(mc.chunks), DATA) - - @async_to_deferred - async def test_upload_and_download_mutable(self): - # To test larger files, change this: - DATA = b"Some data to upload\n" * 10 - - for i in range(5): - # 1. Upload: - with timeit(" upload"): - result = await self.clients[0].create_mutable_file(MutableData(DATA)) - - # 2. Download: - with timeit("download"): - data = await result.download_best_version() - self.assertEqual(data, DATA) - - @async_to_deferred - async def test_upload_mutable_in_parallel(self): - # To test larger files, change this: - DATA = b"Some data to upload\n" * 1_000_000 - with timeit(" upload"): - await gatherResults([ - self.clients[0].create_mutable_file(MutableData(DATA)) - for _ in range(20) - ]) diff --git a/integration/util.py b/integration/util.py index 85a2fc3ee..59be528dc 100644 --- a/integration/util.py +++ b/integration/util.py @@ -240,7 +240,7 @@ def _tahoe_runner_optional_coverage(proto, reactor, request, other_args): allmydata.scripts.runner` and `other_args`, optionally inserting a `--coverage` option if the `request` indicates we should. """ - if request.config.getoption('coverage'): + if request.config.getoption('coverage', False): args = [sys.executable, '-b', '-m', 'coverage', 'run', '-m', 'allmydata.scripts.runner', '--coverage'] else: args = [sys.executable, '-b', '-m', 'allmydata.scripts.runner'] diff --git a/newsfragments/4060.feature b/newsfragments/4060.feature new file mode 100644 index 000000000..5eea8134d --- /dev/null +++ b/newsfragments/4060.feature @@ -0,0 +1 @@ +Started work on a new end-to-end benchmarking framework. 
diff --git a/tox.ini b/tox.ini
index 67a089b0c..a736a7af1 100644
--- a/tox.ini
+++ b/tox.ini
@@ -109,7 +109,7 @@ passenv = HOME
 setenv =
     # If no positional arguments are given, try to run the checks on the
     # entire codebase, including various pieces of supporting code.
-    DEFAULT_FILES=src integration static misc setup.py
+    DEFAULT_FILES=src integration benchmarks static misc setup.py
 commands =
     ruff check {posargs:{env:DEFAULT_FILES}}
     python misc/coding_tools/check-umids.py {posargs:{env:DEFAULT_FILES}}
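
Putting the pieces together, a run against two grid sizes looks something like the following (assuming a development environment where tahoe and flogtool are on PATH; -s keeps pytest from capturing the printed BENCHMARK RESULT lines):

    $ pytest benchmarks --number-of-nodes=3 --number-of-nodes=5 -s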