Merge pull request #1219 from meejah/3926.pid-time-reactor

3926: upgrade PID files to contain start-time, do file-locking and exist on Windows
2025-04-05 09:59:24 +00:00 · 2022-09-28 11:02:46 -06:00 · 2022-09-28 11:02:46 -06:00 · d2dd211420
commit d2dd211420
parent 8086e8c4f0 bef71978b6
11 changed files with 344 additions and 64 deletions
--- a/docs/check_running.py
+++ b/docs/check_running.py
@ -0,0 +1,47 @@
+
+import psutil
+import filelock
+
+
+def can_spawn_tahoe(pidfile):
+    """
+    Determine if we can spawn a Tahoe-LAFS for the given pidfile. That
+    pidfile may be deleted if it is stale.
+
+    :param pathlib.Path pidfile: the file to check, that is the Path
+        to "running.process" in a Tahoe-LAFS configuration directory
+
+    :returns bool: True if we can spawn `tahoe run` here
+    """
+    lockpath = pidfile.parent / (pidfile.name + ".lock")
+    with filelock.FileLock(lockpath):
+        try:
+            with pidfile.open("r") as f:
+                pid, create_time = f.read().strip().split(" ", 1)
+        except FileNotFoundError:
+            return True
+
+        # somewhat interesting: we have a pidfile
+        pid = int(pid)
+        create_time = float(create_time)
+
+        try:
+            proc = psutil.Process(pid)
+            # most interesting case: there _is_ a process running at the
+            # recorded PID -- but did it just happen to get that PID, or
+            # is it the very same one that wrote the file?
+            if create_time == proc.create_time():
+                # _not_ stale! another intance is still running against
+                # this configuration
+                return False
+
+        except psutil.NoSuchProcess:
+            pass
+
+        # the file is stale
+        pidfile.unlink()
+        return True
+
+
+from pathlib import Path
+print("can spawn?", can_spawn_tahoe(Path("running.process")))
--- a/docs/running.rst
+++ b/docs/running.rst
@ -124,6 +124,35 @@ Tahoe-LAFS.
 .. _magic wormhole: https://magic-wormhole.io/


+Multiple Instances
+------------------
+
+Running multiple instances against the same configuration directory isn't supported.
+This will lead to undefined behavior and could corrupt the configuration or state.
+
+We attempt to avoid this situation with a "pidfile"-style file in the config directory called ``running.process``.
+There may be a parallel file called ``running.process.lock`` in existence.
+
+The ``.lock`` file exists to make sure only one process modifies ``running.process`` at once.
+The lock file is managed by the `lockfile <https://pypi.org/project/lockfile/>`_ library.
+If you wish to make use of ``running.process`` for any reason you should also lock it and follow the semantics of lockfile.
+
+If ``running.process`` exists then it contains the PID and the creation-time of the process.
+When no such file exists, there is no other process running on this configuration.
+If there is a ``running.process`` file, it may be a leftover file or it may indicate that another process is running against this config.
+To tell the difference, determine if the PID in the file exists currently.
+If it does, check the creation-time of the process versus the one in the file.
+If these match, there is another process currently running and using this config.
+Otherwise, the file is stale -- it should be removed before starting Tahoe-LAFS.
+
+Some example Python code to check the above situations:
+
+.. literalinclude:: check_running.py
+
+
+
+
+
 A note about small grids
 ------------------------

--- a/newsfragments/3926.incompat
+++ b/newsfragments/3926.incompat
@ -0,0 +1,10 @@
+Record both the PID and the process creation-time:
+
+a new kind of pidfile in `running.process` records both
+the PID and the creation-time of the process. This facilitates
+automatic discovery of a "stale" pidfile that points to a
+currently-running process. If the recorded creation-time matches
+the creation-time of the running process, then it is a still-running
+`tahoe run` process. Otherwise, the file is stale.
+
+The `twistd.pid` file is no longer present.
--- a/setup.py
+++ b/setup.py
@ -138,6 +138,10 @@ install_requires = [
    "treq",
    "cbor2",
    "pycddl",
+
+    # for pid-file support
+    "psutil",
+    "filelock",
 ]

 setup_requires = [
--- a/src/allmydata/scripts/runner.py
+++ b/src/allmydata/scripts/runner.py
@ -47,11 +47,6 @@ if _default_nodedir:
    NODEDIR_HELP += " [default for most commands: " + quote_local_unicode_path(_default_nodedir) + "]"


-# XXX all this 'dispatch' stuff needs to be unified + fixed up
-_control_node_dispatch = {
-    "run": tahoe_run.run,
-}
-
 process_control_commands = [
    ("run", None, tahoe_run.RunOptions, "run a node without daemonizing"),
 ]  # type: SubCommands
@ -195,6 +190,7 @@ def parse_or_exit(config, argv, stdout, stderr):
    return config

 def dispatch(config,
+             reactor,
             stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr):
    command = config.subCommand
    so = config.subOptions
@ -206,8 +202,8 @@ def dispatch(config,

    if command in create_dispatch:
        f = create_dispatch[command]
-    elif command in _control_node_dispatch:
-        f = _control_node_dispatch[command]
+    elif command == "run":
+        f = lambda config: tahoe_run.run(reactor, config)
    elif command in debug.dispatch:
        f = debug.dispatch[command]
    elif command in admin.dispatch:
@ -361,7 +357,7 @@ def _run_with_reactor(reactor, config, argv, stdout, stderr):
        stderr,
    )
    d.addCallback(_maybe_enable_eliot_logging, reactor)
-    d.addCallback(dispatch, stdout=stdout, stderr=stderr)
+    d.addCallback(dispatch, reactor, stdout=stdout, stderr=stderr)
    def _show_exception(f):
        # when task.react() notices a non-SystemExit exception, it does
        # log.err() with the failure and then exits with rc=1. We want this
--- a/src/allmydata/scripts/tahoe_run.py
+++ b/src/allmydata/scripts/tahoe_run.py
@ -19,6 +19,7 @@ import os, sys
 from allmydata.scripts.common import BasedirOptions
 from twisted.scripts import twistd
 from twisted.python import usage
+from twisted.python.filepath import FilePath
 from twisted.python.reflect import namedAny
 from twisted.internet.defer import maybeDeferred
 from twisted.application.service import Service
@ -27,6 +28,13 @@ from allmydata.scripts.default_nodedir import _default_nodedir
 from allmydata.util.encodingutil import listdir_unicode, quote_local_unicode_path
 from allmydata.util.configutil import UnknownConfigError
 from allmydata.util.deferredutil import HookMixin
+from allmydata.util.pid import (
+    parse_pidfile,
+    check_pid_process,
+    cleanup_pidfile,
+    ProcessInTheWay,
+    InvalidPidFile,
+)
 from allmydata.storage.crawler import (
    MigratePickleFileError,
 )
@ -35,35 +43,34 @@ from allmydata.node import (
    PrivacyError,
 )

+
 def get_pidfile(basedir):
    """
    Returns the path to the PID file.
    :param basedir: the node's base directory
    :returns: the path to the PID file
    """
-    return os.path.join(basedir, u"twistd.pid")
+    return os.path.join(basedir, u"running.process")
+

 def get_pid_from_pidfile(pidfile):
    """
    Tries to read and return the PID stored in the node's PID file
-    (twistd.pid).
+
    :param pidfile: try to read this PID file
    :returns: A numeric PID on success, ``None`` if PID file absent or
              inaccessible, ``-1`` if PID file invalid.
    """
    try:
-        with open(pidfile, "r") as f:
-            pid = f.read()
+        pid, _ = parse_pidfile(pidfile)
    except EnvironmentError:
        return None
-
-    try:
-        pid = int(pid)
-    except ValueError:
+    except InvalidPidFile:
        return -1

    return pid

+
 def identify_node_type(basedir):
    """
    :return unicode: None or one of: 'client' or 'introducer'.
@ -206,7 +213,7 @@ class DaemonizeTahoeNodePlugin(object):
        return DaemonizeTheRealService(self.nodetype, self.basedir, so)


-def run(config, runApp=twistd.runApp):
+def run(reactor, config, runApp=twistd.runApp):
    """
    Runs a Tahoe-LAFS node in the foreground.

@ -227,10 +234,15 @@ def run(config, runApp=twistd.runApp):
        print("%s is not a recognizable node directory" % quoted_basedir, file=err)
        return 1

-    twistd_args = ["--nodaemon", "--rundir", basedir]
+    twistd_args = [
+        # ensure twistd machinery does not daemonize.
+        "--nodaemon",
+        "--rundir", basedir,
+    ]
    if sys.platform != "win32":
-        pidfile = get_pidfile(basedir)
-        twistd_args.extend(["--pidfile", pidfile])
+        # turn off Twisted's pid-file to use our own -- but not on
+        # windows, because twistd doesn't know about pidfiles there
+        twistd_args.extend(["--pidfile", None])
    twistd_args.extend(config.twistd_args)
    twistd_args.append("DaemonizeTahoeNode") # point at our DaemonizeTahoeNodePlugin

@ -246,10 +258,18 @@ def run(config, runApp=twistd.runApp):
        return 1
    twistd_config.loadedPlugins = {"DaemonizeTahoeNode": DaemonizeTahoeNodePlugin(nodetype, basedir)}

-    # handle invalid PID file (twistd might not start otherwise)
-    if sys.platform != "win32" and get_pid_from_pidfile(pidfile) == -1:
-        print("found invalid PID file in %s - deleting it" % basedir, file=err)
-        os.remove(pidfile)
+    # our own pid-style file contains PID and process creation time
+    pidfile = FilePath(get_pidfile(config['basedir']))
+    try:
+        check_pid_process(pidfile)
+    except (ProcessInTheWay, InvalidPidFile) as e:
+        print("ERROR: {}".format(e), file=err)
+        return 1
+    else:
+        reactor.addSystemEventTrigger(
+            "after", "shutdown",
+            lambda: cleanup_pidfile(pidfile)
+        )

    # We always pass --nodaemon so twistd.runApp does not daemonize.
    print("running node in %s" % (quoted_basedir,), file=out)
--- a/src/allmydata/test/cli/test_run.py
+++ b/src/allmydata/test/cli/test_run.py
@ -12,23 +12,19 @@ from future.utils import PY2
 if PY2:
    from future.builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, str, max, min  # noqa: F401

+import re
 from six.moves import (
    StringIO,
 )

-from testtools import (
-    skipIf,
-)
+from hypothesis.strategies import text
+from hypothesis import given, assume

 from testtools.matchers import (
    Contains,
    Equals,
-    HasLength,
 )

-from twisted.python.runtime import (
-    platform,
-)
 from twisted.python.filepath import (
    FilePath,
 )
@ -44,6 +40,10 @@ from ...scripts.tahoe_run import (
    RunOptions,
    run,
 )
+from ...util.pid import (
+    check_pid_process,
+    InvalidPidFile,
+)

 from ...scripts.runner import (
    parse_options
@ -151,7 +151,7 @@ class RunTests(SyncTestCase):
    """
    Tests for ``run``.
    """
-    @skipIf(platform.isWindows(), "There are no PID files on Windows.")
+
    def test_non_numeric_pid(self):
        """
        If the pidfile exists but does not contain a numeric value, a complaint to
@ -159,7 +159,7 @@ class RunTests(SyncTestCase):
        """
        basedir = FilePath(self.mktemp()).asTextMode()
        basedir.makedirs()
-        basedir.child(u"twistd.pid").setContent(b"foo")
+        basedir.child(u"running.process").setContent(b"foo")
        basedir.child(u"tahoe-client.tac").setContent(b"")

        config = RunOptions()
@ -168,17 +168,30 @@ class RunTests(SyncTestCase):
        config['basedir'] = basedir.path
        config.twistd_args = []

+        reactor = MemoryReactor()
+
        runs = []
-        result_code = run(config, runApp=runs.append)
+        result_code = run(reactor, config, runApp=runs.append)
        self.assertThat(
            config.stderr.getvalue(),
            Contains("found invalid PID file in"),
        )
-        self.assertThat(
-            runs,
-            HasLength(1),
-        )
-        self.assertThat(
-            result_code,
-            Equals(0),
-        )
+        # because the pidfile is invalid we shouldn't get to the
+        # .run() call itself.
+        self.assertThat(runs, Equals([]))
+        self.assertThat(result_code, Equals(1))
+
+    good_file_content_re = re.compile(r"\w[0-9]*\w[0-9]*\w")
+
+    @given(text())
+    def test_pidfile_contents(self, content):
+        """
+        invalid contents for a pidfile raise errors
+        """
+        assume(not self.good_file_content_re.match(content))
+        pidfile = FilePath("pidfile")
+        pidfile.setContent(content.encode("utf8"))
+
+        with self.assertRaises(InvalidPidFile):
+            with check_pid_process(pidfile):
+                pass
--- a/src/allmydata/test/cli_node_api.py
+++ b/src/allmydata/test/cli_node_api.py
@ -134,7 +134,7 @@ class CLINodeAPI(object):

    @property
    def twistd_pid_file(self):
-        return self.basedir.child(u"twistd.pid")
+        return self.basedir.child(u"running.process")

    @property
    def node_url_file(self):
--- a/src/allmydata/test/common_util.py
+++ b/src/allmydata/test/common_util.py
@ -145,6 +145,7 @@ def run_cli_native(verb, *args, **kwargs):
    )
    d.addCallback(
        runner.dispatch,
+        reactor,
        stdin=stdin,
        stdout=stdout,
        stderr=stderr,
--- a/src/allmydata/test/test_runner.py
+++ b/src/allmydata/test/test_runner.py
@ -42,16 +42,19 @@ from twisted.trial import unittest

 from twisted.internet import reactor
 from twisted.python import usage
+from twisted.python.runtime import platform
 from twisted.internet.defer import (
    inlineCallbacks,
    DeferredList,
 )
 from twisted.python.filepath import FilePath
-from twisted.python.runtime import (
-    platform,
-)
 from allmydata.util import fileutil, pollmixin
 from allmydata.util.encodingutil import unicode_to_argv
+from allmydata.util.pid import (
+    check_pid_process,
+    _pidfile_to_lockpath,
+    ProcessInTheWay,
+)
 from allmydata.test import common_util
 import allmydata
 from allmydata.scripts.runner import (
@ -418,9 +421,7 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin):

        tahoe.active()

-        # We don't keep track of PIDs in files on Windows.
-        if not platform.isWindows():
-            self.assertTrue(tahoe.twistd_pid_file.exists())
+        self.assertTrue(tahoe.twistd_pid_file.exists())
        self.assertTrue(tahoe.node_url_file.exists())

        # rm this so we can detect when the second incarnation is ready
@ -493,9 +494,7 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin):
        # change on restart
        storage_furl = fileutil.read(tahoe.storage_furl_file.path)

-        # We don't keep track of PIDs in files on Windows.
-        if not platform.isWindows():
-            self.assertTrue(tahoe.twistd_pid_file.exists())
+        self.assertTrue(tahoe.twistd_pid_file.exists())

        # rm this so we can detect when the second incarnation is ready
        tahoe.node_url_file.remove()
@ -513,22 +512,23 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin):
            fileutil.read(tahoe.storage_furl_file.path),
        )

-        if not platform.isWindows():
-            self.assertTrue(
-                tahoe.twistd_pid_file.exists(),
-                "PID file ({}) didn't exist when we expected it to.  "
-                "These exist: {}".format(
-                    tahoe.twistd_pid_file,
-                    tahoe.twistd_pid_file.parent().listdir(),
-                ),
-            )
+        self.assertTrue(
+            tahoe.twistd_pid_file.exists(),
+            "PID file ({}) didn't exist when we expected it to.  "
+            "These exist: {}".format(
+                tahoe.twistd_pid_file,
+                tahoe.twistd_pid_file.parent().listdir(),
+            ),
+        )
        yield tahoe.stop_and_wait()

+        # twistd.pid should be gone by now -- except on Windows, where
+        # killing a subprocess immediately exits with no chance for
+        # any shutdown code (that is, no Twisted shutdown hooks can
+        # run).
        if not platform.isWindows():
-            # twistd.pid should be gone by now.
            self.assertFalse(tahoe.twistd_pid_file.exists())

-
    def _remove(self, res, file):
        fileutil.remove(file)
        return res
@ -610,8 +610,9 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin):
            ),
        )

+        # It should not be running (but windows shutdown can't run
+        # code so the PID file still exists there).
        if not platform.isWindows():
-            # It should not be running.
            self.assertFalse(tahoe.twistd_pid_file.exists())

        # Wait for the operation to *complete*.  If we got this far it's
@ -621,3 +622,42 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin):
        # What's left is a perfect indicator that the process has exited and
        # we won't get blamed for leaving the reactor dirty.
        yield client_running
+
+
+class PidFileLocking(SyncTestCase):
+    """
+    Direct tests for allmydata.util.pid functions
+    """
+
+    def test_locking(self):
+        """
+        Fail to create a pidfile if another process has the lock already.
+        """
+        # this can't just be "our" process because the locking library
+        # allows the same process to acquire a lock multiple times.
+        pidfile = FilePath(self.mktemp())
+        lockfile = _pidfile_to_lockpath(pidfile)
+
+        with open("other_lock.py", "w") as f:
+            f.write(
+                "\n".join([
+                    "import filelock, time, sys",
+                    "with filelock.FileLock(sys.argv[1], timeout=1):",
+                    "    sys.stdout.write('.\\n')",
+                    "    sys.stdout.flush()",
+                    "    time.sleep(10)",
+                ])
+            )
+        proc = Popen(
+            [sys.executable, "other_lock.py", lockfile.path],
+            stdout=PIPE,
+            stderr=PIPE,
+        )
+        # make sure our subprocess has had time to acquire the lock
+        # for sure (from the "." it prints)
+        proc.stdout.read(2)
+
+        # acquiring the same lock should fail; it is locked by the subprocess
+        with self.assertRaises(ProcessInTheWay):
+            check_pid_process(pidfile)
+        proc.terminate()
--- a/src/allmydata/util/pid.py
+++ b/src/allmydata/util/pid.py
@ -0,0 +1,120 @@
+import psutil
+
+# the docs are a little misleading, but this is either WindowsFileLock
+# or UnixFileLock depending upon the platform we're currently on
+from filelock import FileLock, Timeout
+
+
+class ProcessInTheWay(Exception):
+    """
+    our pidfile points at a running process
+    """
+
+
+class InvalidPidFile(Exception):
+    """
+    our pidfile isn't well-formed
+    """
+
+
+class CannotRemovePidFile(Exception):
+    """
+    something went wrong removing the pidfile
+    """
+
+
+def _pidfile_to_lockpath(pidfile):
+    """
+    internal helper.
+    :returns FilePath: a path to use for file-locking the given pidfile
+    """
+    return pidfile.sibling("{}.lock".format(pidfile.basename()))
+
+
+def parse_pidfile(pidfile):
+    """
+    :param FilePath pidfile:
+    :returns tuple: 2-tuple of pid, creation-time as int, float
+    :raises InvalidPidFile: on error
+    """
+    with pidfile.open("r") as f:
+        content = f.read().decode("utf8").strip()
+    try:
+        pid, starttime = content.split()
+        pid = int(pid)
+        starttime = float(starttime)
+    except ValueError:
+        raise InvalidPidFile(
+            "found invalid PID file in {}".format(
+                pidfile
+            )
+        )
+    return pid, starttime
+
+
+def check_pid_process(pidfile):
+    """
+    If another instance appears to be running already, raise an
+    exception.  Otherwise, write our PID + start time to the pidfile
+    and arrange to delete it upon exit.
+
+    :param FilePath pidfile: the file to read/write our PID from.
+
+    :raises ProcessInTheWay: if a running process exists at our PID
+    """
+    lock_path = _pidfile_to_lockpath(pidfile)
+
+    try:
+        # a short timeout is fine, this lock should only be active
+        # while someone is reading or deleting the pidfile .. and
+        # facilitates testing the locking itself.
+        with FileLock(lock_path.path, timeout=2):
+            # check if we have another instance running already
+            if pidfile.exists():
+                pid, starttime = parse_pidfile(pidfile)
+                try:
+                    # if any other process is running at that PID, let the
+                    # user decide if this is another legitimate
+                    # instance. Automated programs may use the start-time to
+                    # help decide this (if the PID is merely recycled, the
+                    # start-time won't match).
+                    psutil.Process(pid)
+                    raise ProcessInTheWay(
+                        "A process is already running as PID {}".format(pid)
+                    )
+                except psutil.NoSuchProcess:
+                    print(
+                        "'{pidpath}' refers to {pid} that isn't running".format(
+                            pidpath=pidfile.path,
+                            pid=pid,
+                        )
+                    )
+                    # nothing is running at that PID so it must be a stale file
+                    pidfile.remove()
+
+            # write our PID + start-time to the pid-file
+            proc = psutil.Process()
+            with pidfile.open("w") as f:
+                f.write("{} {}\n".format(proc.pid, proc.create_time()).encode("utf8"))
+    except Timeout:
+        raise ProcessInTheWay(
+            "Another process is still locking {}".format(pidfile.path)
+        )
+
+
+def cleanup_pidfile(pidfile):
+    """
+    Remove the pidfile specified (respecting locks). If anything at
+    all goes wrong, `CannotRemovePidFile` is raised.
+    """
+    lock_path = _pidfile_to_lockpath(pidfile)
+    with FileLock(lock_path.path):
+        try:
+            pidfile.remove()
+        except Exception as e:
+            raise CannotRemovePidFile(
+                "Couldn't remove '{pidfile}': {err}.".format(
+                    pidfile=pidfile.path,
+                    err=e,
+                )
+            )