Merge pull request #1219 from meejah/3926.pid-time-reactor

3926: upgrade PID files to contain start-time, do file-locking and exist on Windows
2024-12-30 09:48:56 +00:00 · 2022-09-28 11:02:46 -06:00 · 2022-09-28 11:02:46 -06:00 · d2dd211420
commit d2dd211420
parent 8086e8c4f0 bef71978b6
11 changed files with 344 additions and 64 deletions
--- a/docs/check_running.py
+++ b/docs/check_running.py
@ -0,0 +1,47 @@
 import psutil
 import filelock
 def can_spawn_tahoe(pidfile):
    """
    Determine if we can spawn a Tahoe-LAFS for the given pidfile. That
    pidfile may be deleted if it is stale.
    :param pathlib.Path pidfile: the file to check, that is the Path
        to "running.process" in a Tahoe-LAFS configuration directory
    :returns bool: True if we can spawn `tahoe run` here
    """
    lockpath = pidfile.parent / (pidfile.name + ".lock")
    with filelock.FileLock(lockpath):
        try:
            with pidfile.open("r") as f:
                pid, create_time = f.read().strip().split(" ", 1)
        except FileNotFoundError:
            return True
        # somewhat interesting: we have a pidfile
        pid = int(pid)
        create_time = float(create_time)
        try:
            proc = psutil.Process(pid)
            # most interesting case: there _is_ a process running at the
            # recorded PID -- but did it just happen to get that PID, or
            # is it the very same one that wrote the file?
            if create_time == proc.create_time():
                # _not_ stale! another intance is still running against
                # this configuration
                return False
        except psutil.NoSuchProcess:
            pass
        # the file is stale
        pidfile.unlink()
        return True
 from pathlib import Path
 print("can spawn?", can_spawn_tahoe(Path("running.process")))
--- a/docs/running.rst
+++ b/docs/running.rst
@ -124,6 +124,35 @@ Tahoe-LAFS.
 .. _magic wormhole: https://magic-wormhole.io/
 Multiple Instances
 ------------------
 Running multiple instances against the same configuration directory isn't supported.
 This will lead to undefined behavior and could corrupt the configuration or state.
 We attempt to avoid this situation with a "pidfile"-style file in the config directory called ``running.process``.
 There may be a parallel file called ``running.process.lock`` in existence.
 The ``.lock`` file exists to make sure only one process modifies ``running.process`` at once.
 The lock file is managed by the `lockfile <https://pypi.org/project/lockfile/>`_ library.
 If you wish to make use of ``running.process`` for any reason you should also lock it and follow the semantics of lockfile.
 If ``running.process`` exists then it contains the PID and the creation-time of the process.
 When no such file exists, there is no other process running on this configuration.
 If there is a ``running.process`` file, it may be a leftover file or it may indicate that another process is running against this config.
 To tell the difference, determine if the PID in the file exists currently.
 If it does, check the creation-time of the process versus the one in the file.
 If these match, there is another process currently running and using this config.
 Otherwise, the file is stale -- it should be removed before starting Tahoe-LAFS.
 Some example Python code to check the above situations:
 .. literalinclude:: check_running.py
 A note about small grids
 ------------------------
--- a/newsfragments/3926.incompat
+++ b/newsfragments/3926.incompat
@ -0,0 +1,10 @@
 Record both the PID and the process creation-time:
 a new kind of pidfile in `running.process` records both
 the PID and the creation-time of the process. This facilitates
 automatic discovery of a "stale" pidfile that points to a
 currently-running process. If the recorded creation-time matches
 the creation-time of the running process, then it is a still-running
 `tahoe run` process. Otherwise, the file is stale.
 The `twistd.pid` file is no longer present.
--- a/setup.py
+++ b/setup.py
@ -138,6 +138,10 @@ install_requires = [
    "treq",
    "cbor2",
    "pycddl",
    # for pid-file support
    "psutil",
    "filelock",
 ]
 setup_requires = [
--- a/src/allmydata/scripts/runner.py
+++ b/src/allmydata/scripts/runner.py
@ -47,11 +47,6 @@ if _default_nodedir:
    NODEDIR_HELP += " [default for most commands: " + quote_local_unicode_path(_default_nodedir) + "]"
 # XXX all this 'dispatch' stuff needs to be unified + fixed up
 _control_node_dispatch = {
    "run": tahoe_run.run,
 }
 process_control_commands = [
    ("run", None, tahoe_run.RunOptions, "run a node without daemonizing"),
 ]  # type: SubCommands
@ -195,6 +190,7 @@ def parse_or_exit(config, argv, stdout, stderr):
    return config
 def dispatch(config,
             reactor,
             stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr):
    command = config.subCommand
    so = config.subOptions
@ -206,8 +202,8 @@ def dispatch(config,
    if command in create_dispatch:
        f = create_dispatch[command]
-    elif command in _control_node_dispatch:
+    elif command == "run":
-        f = _control_node_dispatch[command]
+        f = lambda config: tahoe_run.run(reactor, config)
    elif command in debug.dispatch:
        f = debug.dispatch[command]
    elif command in admin.dispatch:
@ -361,7 +357,7 @@ def _run_with_reactor(reactor, config, argv, stdout, stderr):
        stderr,
    )
    d.addCallback(_maybe_enable_eliot_logging, reactor)
-    d.addCallback(dispatch, stdout=stdout, stderr=stderr)
+    d.addCallback(dispatch, reactor, stdout=stdout, stderr=stderr)
    def _show_exception(f):
        # when task.react() notices a non-SystemExit exception, it does
        # log.err() with the failure and then exits with rc=1. We want this
--- a/src/allmydata/scripts/tahoe_run.py
+++ b/src/allmydata/scripts/tahoe_run.py
@ -19,6 +19,7 @@ import os, sys
 from allmydata.scripts.common import BasedirOptions
 from twisted.scripts import twistd
 from twisted.python import usage
 from twisted.python.filepath import FilePath
 from twisted.python.reflect import namedAny
 from twisted.internet.defer import maybeDeferred
 from twisted.application.service import Service
@ -27,6 +28,13 @@ from allmydata.scripts.default_nodedir import _default_nodedir
 from allmydata.util.encodingutil import listdir_unicode, quote_local_unicode_path
 from allmydata.util.configutil import UnknownConfigError
 from allmydata.util.deferredutil import HookMixin
 from allmydata.util.pid import (
    parse_pidfile,
    check_pid_process,
    cleanup_pidfile,
    ProcessInTheWay,
    InvalidPidFile,
 )
 from allmydata.storage.crawler import (
    MigratePickleFileError,
 )
@ -35,35 +43,34 @@ from allmydata.node import (
    PrivacyError,
 )
 def get_pidfile(basedir):
    """
    Returns the path to the PID file.
    :param basedir: the node's base directory
    :returns: the path to the PID file
    """
-    return os.path.join(basedir, u"twistd.pid")
+    return os.path.join(basedir, u"running.process")
 def get_pid_from_pidfile(pidfile):
    """
    Tries to read and return the PID stored in the node's PID file
-    (twistd.pid).
+
    :param pidfile: try to read this PID file
    :returns: A numeric PID on success, ``None`` if PID file absent or
              inaccessible, ``-1`` if PID file invalid.
    """
    try:
-        with open(pidfile, "r") as f:
+        pid, _ = parse_pidfile(pidfile)
            pid = f.read()
    except EnvironmentError:
        return None
-
+    except InvalidPidFile:
    try:
        pid = int(pid)
    except ValueError:
        return -1
    return pid
 def identify_node_type(basedir):
    """
    :return unicode: None or one of: 'client' or 'introducer'.
@ -206,7 +213,7 @@ class DaemonizeTahoeNodePlugin(object):
        return DaemonizeTheRealService(self.nodetype, self.basedir, so)
-def run(config, runApp=twistd.runApp):
+def run(reactor, config, runApp=twistd.runApp):
    """
    Runs a Tahoe-LAFS node in the foreground.
@ -227,10 +234,15 @@ def run(config, runApp=twistd.runApp):
        print("%s is not a recognizable node directory" % quoted_basedir, file=err)
        return 1
-    twistd_args = ["--nodaemon", "--rundir", basedir]
+    twistd_args = [
        # ensure twistd machinery does not daemonize.
        "--nodaemon",
        "--rundir", basedir,
    ]
    if sys.platform != "win32":
-        pidfile = get_pidfile(basedir)
+        # turn off Twisted's pid-file to use our own -- but not on
-        twistd_args.extend(["--pidfile", pidfile])
+        # windows, because twistd doesn't know about pidfiles there
        twistd_args.extend(["--pidfile", None])
    twistd_args.extend(config.twistd_args)
    twistd_args.append("DaemonizeTahoeNode") # point at our DaemonizeTahoeNodePlugin
@ -246,10 +258,18 @@ def run(config, runApp=twistd.runApp):
        return 1
    twistd_config.loadedPlugins = {"DaemonizeTahoeNode": DaemonizeTahoeNodePlugin(nodetype, basedir)}
-    # handle invalid PID file (twistd might not start otherwise)
+    # our own pid-style file contains PID and process creation time
-    if sys.platform != "win32" and get_pid_from_pidfile(pidfile) == -1:
+    pidfile = FilePath(get_pidfile(config['basedir']))
-        print("found invalid PID file in %s - deleting it" % basedir, file=err)
+    try:
-        os.remove(pidfile)
+        check_pid_process(pidfile)
    except (ProcessInTheWay, InvalidPidFile) as e:
        print("ERROR: {}".format(e), file=err)
        return 1
    else:
        reactor.addSystemEventTrigger(
            "after", "shutdown",
            lambda: cleanup_pidfile(pidfile)
        )
    # We always pass --nodaemon so twistd.runApp does not daemonize.
    print("running node in %s" % (quoted_basedir,), file=out)
--- a/src/allmydata/test/cli/test_run.py
+++ b/src/allmydata/test/cli/test_run.py
@ -12,23 +12,19 @@ from future.utils import PY2
 if PY2:
    from future.builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, str, max, min  # noqa: F401
 import re
 from six.moves import (
    StringIO,
 )
-from testtools import (
+from hypothesis.strategies import text
-    skipIf,
+from hypothesis import given, assume
 )
 from testtools.matchers import (
    Contains,
    Equals,
    HasLength,
 )
 from twisted.python.runtime import (
    platform,
 )
 from twisted.python.filepath import (
    FilePath,
 )
@ -44,6 +40,10 @@ from ...scripts.tahoe_run import (
    RunOptions,
    run,
 )
 from ...util.pid import (
    check_pid_process,
    InvalidPidFile,
 )
 from ...scripts.runner import (
    parse_options
@ -151,7 +151,7 @@ class RunTests(SyncTestCase):
    """
    Tests for ``run``.
    """
-    @skipIf(platform.isWindows(), "There are no PID files on Windows.")
+
    def test_non_numeric_pid(self):
        """
        If the pidfile exists but does not contain a numeric value, a complaint to
@ -159,7 +159,7 @@ class RunTests(SyncTestCase):
        """
        basedir = FilePath(self.mktemp()).asTextMode()
        basedir.makedirs()
-        basedir.child(u"twistd.pid").setContent(b"foo")
+        basedir.child(u"running.process").setContent(b"foo")
        basedir.child(u"tahoe-client.tac").setContent(b"")
        config = RunOptions()
@ -168,17 +168,30 @@ class RunTests(SyncTestCase):
        config['basedir'] = basedir.path
        config.twistd_args = []
        reactor = MemoryReactor()
        runs = []
-        result_code = run(config, runApp=runs.append)
+        result_code = run(reactor, config, runApp=runs.append)
        self.assertThat(
            config.stderr.getvalue(),
            Contains("found invalid PID file in"),
        )
-        self.assertThat(
+        # because the pidfile is invalid we shouldn't get to the
-            runs,
+        # .run() call itself.
-            HasLength(1),
+        self.assertThat(runs, Equals([]))
-        )
+        self.assertThat(result_code, Equals(1))
-        self.assertThat(
+
-            result_code,
+    good_file_content_re = re.compile(r"\w[0-9]*\w[0-9]*\w")
-            Equals(0),
+
-        )
+    @given(text())
    def test_pidfile_contents(self, content):
        """
        invalid contents for a pidfile raise errors
        """
        assume(not self.good_file_content_re.match(content))
        pidfile = FilePath("pidfile")
        pidfile.setContent(content.encode("utf8"))
        with self.assertRaises(InvalidPidFile):
            with check_pid_process(pidfile):
                pass
--- a/src/allmydata/test/cli_node_api.py
+++ b/src/allmydata/test/cli_node_api.py
@ -134,7 +134,7 @@ class CLINodeAPI(object):
    @property
    def twistd_pid_file(self):
-        return self.basedir.child(u"twistd.pid")
+        return self.basedir.child(u"running.process")
    @property
    def node_url_file(self):
--- a/src/allmydata/test/common_util.py
+++ b/src/allmydata/test/common_util.py
@ -145,6 +145,7 @@ def run_cli_native(verb, *args, **kwargs):
    )
    d.addCallback(
        runner.dispatch,
        reactor,
        stdin=stdin,
        stdout=stdout,
        stderr=stderr,
--- a/src/allmydata/test/test_runner.py
+++ b/src/allmydata/test/test_runner.py
@ -42,16 +42,19 @@ from twisted.trial import unittest
 from twisted.internet import reactor
 from twisted.python import usage
 from twisted.python.runtime import platform
 from twisted.internet.defer import (
    inlineCallbacks,
    DeferredList,
 )
 from twisted.python.filepath import FilePath
 from twisted.python.runtime import (
    platform,
 )
 from allmydata.util import fileutil, pollmixin
 from allmydata.util.encodingutil import unicode_to_argv
 from allmydata.util.pid import (
    check_pid_process,
    _pidfile_to_lockpath,
    ProcessInTheWay,
 )
 from allmydata.test import common_util
 import allmydata
 from allmydata.scripts.runner import (
@ -418,8 +421,6 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin):
        tahoe.active()
        # We don't keep track of PIDs in files on Windows.
        if not platform.isWindows():
        self.assertTrue(tahoe.twistd_pid_file.exists())
        self.assertTrue(tahoe.node_url_file.exists())
@ -493,8 +494,6 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin):
        # change on restart
        storage_furl = fileutil.read(tahoe.storage_furl_file.path)
        # We don't keep track of PIDs in files on Windows.
        if not platform.isWindows():
        self.assertTrue(tahoe.twistd_pid_file.exists())
        # rm this so we can detect when the second incarnation is ready
@ -513,7 +512,6 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin):
            fileutil.read(tahoe.storage_furl_file.path),
        )
        if not platform.isWindows():
        self.assertTrue(
            tahoe.twistd_pid_file.exists(),
            "PID file ({}) didn't exist when we expected it to.  "
@ -524,11 +522,13 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin):
        )
        yield tahoe.stop_and_wait()
        # twistd.pid should be gone by now -- except on Windows, where
        # killing a subprocess immediately exits with no chance for
        # any shutdown code (that is, no Twisted shutdown hooks can
        # run).
        if not platform.isWindows():
            # twistd.pid should be gone by now.
            self.assertFalse(tahoe.twistd_pid_file.exists())
    def _remove(self, res, file):
        fileutil.remove(file)
        return res
@ -610,8 +610,9 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin):
            ),
        )
        # It should not be running (but windows shutdown can't run
        # code so the PID file still exists there).
        if not platform.isWindows():
            # It should not be running.
            self.assertFalse(tahoe.twistd_pid_file.exists())
        # Wait for the operation to *complete*.  If we got this far it's
@ -621,3 +622,42 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin):
        # What's left is a perfect indicator that the process has exited and
        # we won't get blamed for leaving the reactor dirty.
        yield client_running
 class PidFileLocking(SyncTestCase):
    """
    Direct tests for allmydata.util.pid functions
    """
    def test_locking(self):
        """
        Fail to create a pidfile if another process has the lock already.
        """
        # this can't just be "our" process because the locking library
        # allows the same process to acquire a lock multiple times.
        pidfile = FilePath(self.mktemp())
        lockfile = _pidfile_to_lockpath(pidfile)
        with open("other_lock.py", "w") as f:
            f.write(
                "\n".join([
                    "import filelock, time, sys",
                    "with filelock.FileLock(sys.argv[1], timeout=1):",
                    "    sys.stdout.write('.\\n')",
                    "    sys.stdout.flush()",
                    "    time.sleep(10)",
                ])
            )
        proc = Popen(
            [sys.executable, "other_lock.py", lockfile.path],
            stdout=PIPE,
            stderr=PIPE,
        )
        # make sure our subprocess has had time to acquire the lock
        # for sure (from the "." it prints)
        proc.stdout.read(2)
        # acquiring the same lock should fail; it is locked by the subprocess
        with self.assertRaises(ProcessInTheWay):
            check_pid_process(pidfile)
        proc.terminate()
--- a/src/allmydata/util/pid.py
+++ b/src/allmydata/util/pid.py
@ -0,0 +1,120 @@
 import psutil
 # the docs are a little misleading, but this is either WindowsFileLock
 # or UnixFileLock depending upon the platform we're currently on
 from filelock import FileLock, Timeout
 class ProcessInTheWay(Exception):
    """
    our pidfile points at a running process
    """
 class InvalidPidFile(Exception):
    """
    our pidfile isn't well-formed
    """
 class CannotRemovePidFile(Exception):
    """
    something went wrong removing the pidfile
    """
 def _pidfile_to_lockpath(pidfile):
    """
    internal helper.
    :returns FilePath: a path to use for file-locking the given pidfile
    """
    return pidfile.sibling("{}.lock".format(pidfile.basename()))
 def parse_pidfile(pidfile):
    """
    :param FilePath pidfile:
    :returns tuple: 2-tuple of pid, creation-time as int, float
    :raises InvalidPidFile: on error
    """
    with pidfile.open("r") as f:
        content = f.read().decode("utf8").strip()
    try:
        pid, starttime = content.split()
        pid = int(pid)
        starttime = float(starttime)
    except ValueError:
        raise InvalidPidFile(
            "found invalid PID file in {}".format(
                pidfile
            )
        )
    return pid, starttime
 def check_pid_process(pidfile):
    """
    If another instance appears to be running already, raise an
    exception.  Otherwise, write our PID + start time to the pidfile
    and arrange to delete it upon exit.
    :param FilePath pidfile: the file to read/write our PID from.
    :raises ProcessInTheWay: if a running process exists at our PID
    """
    lock_path = _pidfile_to_lockpath(pidfile)
    try:
        # a short timeout is fine, this lock should only be active
        # while someone is reading or deleting the pidfile .. and
        # facilitates testing the locking itself.
        with FileLock(lock_path.path, timeout=2):
            # check if we have another instance running already
            if pidfile.exists():
                pid, starttime = parse_pidfile(pidfile)
                try:
                    # if any other process is running at that PID, let the
                    # user decide if this is another legitimate
                    # instance. Automated programs may use the start-time to
                    # help decide this (if the PID is merely recycled, the
                    # start-time won't match).
                    psutil.Process(pid)
                    raise ProcessInTheWay(
                        "A process is already running as PID {}".format(pid)
                    )
                except psutil.NoSuchProcess:
                    print(
                        "'{pidpath}' refers to {pid} that isn't running".format(
                            pidpath=pidfile.path,
                            pid=pid,
                        )
                    )
                    # nothing is running at that PID so it must be a stale file
                    pidfile.remove()
            # write our PID + start-time to the pid-file
            proc = psutil.Process()
            with pidfile.open("w") as f:
                f.write("{} {}\n".format(proc.pid, proc.create_time()).encode("utf8"))
    except Timeout:
        raise ProcessInTheWay(
            "Another process is still locking {}".format(pidfile.path)
        )
 def cleanup_pidfile(pidfile):
    """
    Remove the pidfile specified (respecting locks). If anything at
    all goes wrong, `CannotRemovePidFile` is raised.
    """
    lock_path = _pidfile_to_lockpath(pidfile)
    with FileLock(lock_path.path):
        try:
            pidfile.remove()
        except Exception as e:
            raise CannotRemovePidFile(
                "Couldn't remove '{pidfile}': {err}.".format(
                    pidfile=pidfile.path,
                    err=e,
                )
            )