diff --git a/docs/check_running.py b/docs/check_running.py new file mode 100644 index 000000000..2705f1721 --- /dev/null +++ b/docs/check_running.py @@ -0,0 +1,47 @@ + +import psutil +import filelock + + +def can_spawn_tahoe(pidfile): + """ + Determine if we can spawn a Tahoe-LAFS for the given pidfile. That + pidfile may be deleted if it is stale. + + :param pathlib.Path pidfile: the file to check, that is the Path + to "running.process" in a Tahoe-LAFS configuration directory + + :returns bool: True if we can spawn `tahoe run` here + """ + lockpath = pidfile.parent / (pidfile.name + ".lock") + with filelock.FileLock(lockpath): + try: + with pidfile.open("r") as f: + pid, create_time = f.read().strip().split(" ", 1) + except FileNotFoundError: + return True + + # somewhat interesting: we have a pidfile + pid = int(pid) + create_time = float(create_time) + + try: + proc = psutil.Process(pid) + # most interesting case: there _is_ a process running at the + # recorded PID -- but did it just happen to get that PID, or + # is it the very same one that wrote the file? + if create_time == proc.create_time(): + # _not_ stale! another intance is still running against + # this configuration + return False + + except psutil.NoSuchProcess: + pass + + # the file is stale + pidfile.unlink() + return True + + +from pathlib import Path +print("can spawn?", can_spawn_tahoe(Path("running.process"))) diff --git a/docs/running.rst b/docs/running.rst index 406c8200b..263448735 100644 --- a/docs/running.rst +++ b/docs/running.rst @@ -124,6 +124,35 @@ Tahoe-LAFS. .. _magic wormhole: https://magic-wormhole.io/ +Multiple Instances +------------------ + +Running multiple instances against the same configuration directory isn't supported. +This will lead to undefined behavior and could corrupt the configuration or state. + +We attempt to avoid this situation with a "pidfile"-style file in the config directory called ``running.process``. +There may be a parallel file called ``running.process.lock`` in existence. + +The ``.lock`` file exists to make sure only one process modifies ``running.process`` at once. +The lock file is managed by the `lockfile `_ library. +If you wish to make use of ``running.process`` for any reason you should also lock it and follow the semantics of lockfile. + +If ``running.process`` exists then it contains the PID and the creation-time of the process. +When no such file exists, there is no other process running on this configuration. +If there is a ``running.process`` file, it may be a leftover file or it may indicate that another process is running against this config. +To tell the difference, determine if the PID in the file exists currently. +If it does, check the creation-time of the process versus the one in the file. +If these match, there is another process currently running and using this config. +Otherwise, the file is stale -- it should be removed before starting Tahoe-LAFS. + +Some example Python code to check the above situations: + +.. literalinclude:: check_running.py + + + + + A note about small grids ------------------------ diff --git a/newsfragments/3926.incompat b/newsfragments/3926.incompat new file mode 100644 index 000000000..674ad289c --- /dev/null +++ b/newsfragments/3926.incompat @@ -0,0 +1,10 @@ +Record both the PID and the process creation-time: + +a new kind of pidfile in `running.process` records both +the PID and the creation-time of the process. This facilitates +automatic discovery of a "stale" pidfile that points to a +currently-running process. If the recorded creation-time matches +the creation-time of the running process, then it is a still-running +`tahoe run` process. Otherwise, the file is stale. + +The `twistd.pid` file is no longer present. \ No newline at end of file diff --git a/setup.py b/setup.py index c3ee4eb90..d99831347 100644 --- a/setup.py +++ b/setup.py @@ -138,6 +138,10 @@ install_requires = [ "treq", "cbor2", "pycddl", + + # for pid-file support + "psutil", + "filelock", ] setup_requires = [ diff --git a/src/allmydata/scripts/runner.py b/src/allmydata/scripts/runner.py index a0d8a752b..756c26f2c 100644 --- a/src/allmydata/scripts/runner.py +++ b/src/allmydata/scripts/runner.py @@ -47,11 +47,6 @@ if _default_nodedir: NODEDIR_HELP += " [default for most commands: " + quote_local_unicode_path(_default_nodedir) + "]" -# XXX all this 'dispatch' stuff needs to be unified + fixed up -_control_node_dispatch = { - "run": tahoe_run.run, -} - process_control_commands = [ ("run", None, tahoe_run.RunOptions, "run a node without daemonizing"), ] # type: SubCommands @@ -195,6 +190,7 @@ def parse_or_exit(config, argv, stdout, stderr): return config def dispatch(config, + reactor, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr): command = config.subCommand so = config.subOptions @@ -206,8 +202,8 @@ def dispatch(config, if command in create_dispatch: f = create_dispatch[command] - elif command in _control_node_dispatch: - f = _control_node_dispatch[command] + elif command == "run": + f = lambda config: tahoe_run.run(reactor, config) elif command in debug.dispatch: f = debug.dispatch[command] elif command in admin.dispatch: @@ -361,7 +357,7 @@ def _run_with_reactor(reactor, config, argv, stdout, stderr): stderr, ) d.addCallback(_maybe_enable_eliot_logging, reactor) - d.addCallback(dispatch, stdout=stdout, stderr=stderr) + d.addCallback(dispatch, reactor, stdout=stdout, stderr=stderr) def _show_exception(f): # when task.react() notices a non-SystemExit exception, it does # log.err() with the failure and then exits with rc=1. We want this diff --git a/src/allmydata/scripts/tahoe_run.py b/src/allmydata/scripts/tahoe_run.py index 51be32ee3..e22e8c307 100644 --- a/src/allmydata/scripts/tahoe_run.py +++ b/src/allmydata/scripts/tahoe_run.py @@ -19,6 +19,7 @@ import os, sys from allmydata.scripts.common import BasedirOptions from twisted.scripts import twistd from twisted.python import usage +from twisted.python.filepath import FilePath from twisted.python.reflect import namedAny from twisted.internet.defer import maybeDeferred from twisted.application.service import Service @@ -27,6 +28,13 @@ from allmydata.scripts.default_nodedir import _default_nodedir from allmydata.util.encodingutil import listdir_unicode, quote_local_unicode_path from allmydata.util.configutil import UnknownConfigError from allmydata.util.deferredutil import HookMixin +from allmydata.util.pid import ( + parse_pidfile, + check_pid_process, + cleanup_pidfile, + ProcessInTheWay, + InvalidPidFile, +) from allmydata.storage.crawler import ( MigratePickleFileError, ) @@ -35,35 +43,34 @@ from allmydata.node import ( PrivacyError, ) + def get_pidfile(basedir): """ Returns the path to the PID file. :param basedir: the node's base directory :returns: the path to the PID file """ - return os.path.join(basedir, u"twistd.pid") + return os.path.join(basedir, u"running.process") + def get_pid_from_pidfile(pidfile): """ Tries to read and return the PID stored in the node's PID file - (twistd.pid). + :param pidfile: try to read this PID file :returns: A numeric PID on success, ``None`` if PID file absent or inaccessible, ``-1`` if PID file invalid. """ try: - with open(pidfile, "r") as f: - pid = f.read() + pid, _ = parse_pidfile(pidfile) except EnvironmentError: return None - - try: - pid = int(pid) - except ValueError: + except InvalidPidFile: return -1 return pid + def identify_node_type(basedir): """ :return unicode: None or one of: 'client' or 'introducer'. @@ -206,7 +213,7 @@ class DaemonizeTahoeNodePlugin(object): return DaemonizeTheRealService(self.nodetype, self.basedir, so) -def run(config, runApp=twistd.runApp): +def run(reactor, config, runApp=twistd.runApp): """ Runs a Tahoe-LAFS node in the foreground. @@ -227,10 +234,15 @@ def run(config, runApp=twistd.runApp): print("%s is not a recognizable node directory" % quoted_basedir, file=err) return 1 - twistd_args = ["--nodaemon", "--rundir", basedir] + twistd_args = [ + # ensure twistd machinery does not daemonize. + "--nodaemon", + "--rundir", basedir, + ] if sys.platform != "win32": - pidfile = get_pidfile(basedir) - twistd_args.extend(["--pidfile", pidfile]) + # turn off Twisted's pid-file to use our own -- but not on + # windows, because twistd doesn't know about pidfiles there + twistd_args.extend(["--pidfile", None]) twistd_args.extend(config.twistd_args) twistd_args.append("DaemonizeTahoeNode") # point at our DaemonizeTahoeNodePlugin @@ -246,10 +258,18 @@ def run(config, runApp=twistd.runApp): return 1 twistd_config.loadedPlugins = {"DaemonizeTahoeNode": DaemonizeTahoeNodePlugin(nodetype, basedir)} - # handle invalid PID file (twistd might not start otherwise) - if sys.platform != "win32" and get_pid_from_pidfile(pidfile) == -1: - print("found invalid PID file in %s - deleting it" % basedir, file=err) - os.remove(pidfile) + # our own pid-style file contains PID and process creation time + pidfile = FilePath(get_pidfile(config['basedir'])) + try: + check_pid_process(pidfile) + except (ProcessInTheWay, InvalidPidFile) as e: + print("ERROR: {}".format(e), file=err) + return 1 + else: + reactor.addSystemEventTrigger( + "after", "shutdown", + lambda: cleanup_pidfile(pidfile) + ) # We always pass --nodaemon so twistd.runApp does not daemonize. print("running node in %s" % (quoted_basedir,), file=out) diff --git a/src/allmydata/test/cli/test_run.py b/src/allmydata/test/cli/test_run.py index 28613e8c1..e84f52096 100644 --- a/src/allmydata/test/cli/test_run.py +++ b/src/allmydata/test/cli/test_run.py @@ -12,23 +12,19 @@ from future.utils import PY2 if PY2: from future.builtins import filter, map, zip, ascii, chr, hex, input, next, oct, open, pow, round, super, bytes, dict, list, object, range, str, max, min # noqa: F401 +import re from six.moves import ( StringIO, ) -from testtools import ( - skipIf, -) +from hypothesis.strategies import text +from hypothesis import given, assume from testtools.matchers import ( Contains, Equals, - HasLength, ) -from twisted.python.runtime import ( - platform, -) from twisted.python.filepath import ( FilePath, ) @@ -44,6 +40,10 @@ from ...scripts.tahoe_run import ( RunOptions, run, ) +from ...util.pid import ( + check_pid_process, + InvalidPidFile, +) from ...scripts.runner import ( parse_options @@ -151,7 +151,7 @@ class RunTests(SyncTestCase): """ Tests for ``run``. """ - @skipIf(platform.isWindows(), "There are no PID files on Windows.") + def test_non_numeric_pid(self): """ If the pidfile exists but does not contain a numeric value, a complaint to @@ -159,7 +159,7 @@ class RunTests(SyncTestCase): """ basedir = FilePath(self.mktemp()).asTextMode() basedir.makedirs() - basedir.child(u"twistd.pid").setContent(b"foo") + basedir.child(u"running.process").setContent(b"foo") basedir.child(u"tahoe-client.tac").setContent(b"") config = RunOptions() @@ -168,17 +168,30 @@ class RunTests(SyncTestCase): config['basedir'] = basedir.path config.twistd_args = [] + reactor = MemoryReactor() + runs = [] - result_code = run(config, runApp=runs.append) + result_code = run(reactor, config, runApp=runs.append) self.assertThat( config.stderr.getvalue(), Contains("found invalid PID file in"), ) - self.assertThat( - runs, - HasLength(1), - ) - self.assertThat( - result_code, - Equals(0), - ) + # because the pidfile is invalid we shouldn't get to the + # .run() call itself. + self.assertThat(runs, Equals([])) + self.assertThat(result_code, Equals(1)) + + good_file_content_re = re.compile(r"\w[0-9]*\w[0-9]*\w") + + @given(text()) + def test_pidfile_contents(self, content): + """ + invalid contents for a pidfile raise errors + """ + assume(not self.good_file_content_re.match(content)) + pidfile = FilePath("pidfile") + pidfile.setContent(content.encode("utf8")) + + with self.assertRaises(InvalidPidFile): + with check_pid_process(pidfile): + pass diff --git a/src/allmydata/test/cli_node_api.py b/src/allmydata/test/cli_node_api.py index 410796be2..c324d5565 100644 --- a/src/allmydata/test/cli_node_api.py +++ b/src/allmydata/test/cli_node_api.py @@ -134,7 +134,7 @@ class CLINodeAPI(object): @property def twistd_pid_file(self): - return self.basedir.child(u"twistd.pid") + return self.basedir.child(u"running.process") @property def node_url_file(self): diff --git a/src/allmydata/test/common_util.py b/src/allmydata/test/common_util.py index e63c3eef8..b6d352ab1 100644 --- a/src/allmydata/test/common_util.py +++ b/src/allmydata/test/common_util.py @@ -145,6 +145,7 @@ def run_cli_native(verb, *args, **kwargs): ) d.addCallback( runner.dispatch, + reactor, stdin=stdin, stdout=stdout, stderr=stderr, diff --git a/src/allmydata/test/test_runner.py b/src/allmydata/test/test_runner.py index 3eb6b8a34..74e3f803e 100644 --- a/src/allmydata/test/test_runner.py +++ b/src/allmydata/test/test_runner.py @@ -42,16 +42,19 @@ from twisted.trial import unittest from twisted.internet import reactor from twisted.python import usage +from twisted.python.runtime import platform from twisted.internet.defer import ( inlineCallbacks, DeferredList, ) from twisted.python.filepath import FilePath -from twisted.python.runtime import ( - platform, -) from allmydata.util import fileutil, pollmixin from allmydata.util.encodingutil import unicode_to_argv +from allmydata.util.pid import ( + check_pid_process, + _pidfile_to_lockpath, + ProcessInTheWay, +) from allmydata.test import common_util import allmydata from allmydata.scripts.runner import ( @@ -418,9 +421,7 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin): tahoe.active() - # We don't keep track of PIDs in files on Windows. - if not platform.isWindows(): - self.assertTrue(tahoe.twistd_pid_file.exists()) + self.assertTrue(tahoe.twistd_pid_file.exists()) self.assertTrue(tahoe.node_url_file.exists()) # rm this so we can detect when the second incarnation is ready @@ -493,9 +494,7 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin): # change on restart storage_furl = fileutil.read(tahoe.storage_furl_file.path) - # We don't keep track of PIDs in files on Windows. - if not platform.isWindows(): - self.assertTrue(tahoe.twistd_pid_file.exists()) + self.assertTrue(tahoe.twistd_pid_file.exists()) # rm this so we can detect when the second incarnation is ready tahoe.node_url_file.remove() @@ -513,22 +512,23 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin): fileutil.read(tahoe.storage_furl_file.path), ) - if not platform.isWindows(): - self.assertTrue( - tahoe.twistd_pid_file.exists(), - "PID file ({}) didn't exist when we expected it to. " - "These exist: {}".format( - tahoe.twistd_pid_file, - tahoe.twistd_pid_file.parent().listdir(), - ), - ) + self.assertTrue( + tahoe.twistd_pid_file.exists(), + "PID file ({}) didn't exist when we expected it to. " + "These exist: {}".format( + tahoe.twistd_pid_file, + tahoe.twistd_pid_file.parent().listdir(), + ), + ) yield tahoe.stop_and_wait() + # twistd.pid should be gone by now -- except on Windows, where + # killing a subprocess immediately exits with no chance for + # any shutdown code (that is, no Twisted shutdown hooks can + # run). if not platform.isWindows(): - # twistd.pid should be gone by now. self.assertFalse(tahoe.twistd_pid_file.exists()) - def _remove(self, res, file): fileutil.remove(file) return res @@ -610,8 +610,9 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin): ), ) + # It should not be running (but windows shutdown can't run + # code so the PID file still exists there). if not platform.isWindows(): - # It should not be running. self.assertFalse(tahoe.twistd_pid_file.exists()) # Wait for the operation to *complete*. If we got this far it's @@ -621,3 +622,42 @@ class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin): # What's left is a perfect indicator that the process has exited and # we won't get blamed for leaving the reactor dirty. yield client_running + + +class PidFileLocking(SyncTestCase): + """ + Direct tests for allmydata.util.pid functions + """ + + def test_locking(self): + """ + Fail to create a pidfile if another process has the lock already. + """ + # this can't just be "our" process because the locking library + # allows the same process to acquire a lock multiple times. + pidfile = FilePath(self.mktemp()) + lockfile = _pidfile_to_lockpath(pidfile) + + with open("other_lock.py", "w") as f: + f.write( + "\n".join([ + "import filelock, time, sys", + "with filelock.FileLock(sys.argv[1], timeout=1):", + " sys.stdout.write('.\\n')", + " sys.stdout.flush()", + " time.sleep(10)", + ]) + ) + proc = Popen( + [sys.executable, "other_lock.py", lockfile.path], + stdout=PIPE, + stderr=PIPE, + ) + # make sure our subprocess has had time to acquire the lock + # for sure (from the "." it prints) + proc.stdout.read(2) + + # acquiring the same lock should fail; it is locked by the subprocess + with self.assertRaises(ProcessInTheWay): + check_pid_process(pidfile) + proc.terminate() diff --git a/src/allmydata/util/pid.py b/src/allmydata/util/pid.py new file mode 100644 index 000000000..f12c201d1 --- /dev/null +++ b/src/allmydata/util/pid.py @@ -0,0 +1,120 @@ +import psutil + +# the docs are a little misleading, but this is either WindowsFileLock +# or UnixFileLock depending upon the platform we're currently on +from filelock import FileLock, Timeout + + +class ProcessInTheWay(Exception): + """ + our pidfile points at a running process + """ + + +class InvalidPidFile(Exception): + """ + our pidfile isn't well-formed + """ + + +class CannotRemovePidFile(Exception): + """ + something went wrong removing the pidfile + """ + + +def _pidfile_to_lockpath(pidfile): + """ + internal helper. + :returns FilePath: a path to use for file-locking the given pidfile + """ + return pidfile.sibling("{}.lock".format(pidfile.basename())) + + +def parse_pidfile(pidfile): + """ + :param FilePath pidfile: + :returns tuple: 2-tuple of pid, creation-time as int, float + :raises InvalidPidFile: on error + """ + with pidfile.open("r") as f: + content = f.read().decode("utf8").strip() + try: + pid, starttime = content.split() + pid = int(pid) + starttime = float(starttime) + except ValueError: + raise InvalidPidFile( + "found invalid PID file in {}".format( + pidfile + ) + ) + return pid, starttime + + +def check_pid_process(pidfile): + """ + If another instance appears to be running already, raise an + exception. Otherwise, write our PID + start time to the pidfile + and arrange to delete it upon exit. + + :param FilePath pidfile: the file to read/write our PID from. + + :raises ProcessInTheWay: if a running process exists at our PID + """ + lock_path = _pidfile_to_lockpath(pidfile) + + try: + # a short timeout is fine, this lock should only be active + # while someone is reading or deleting the pidfile .. and + # facilitates testing the locking itself. + with FileLock(lock_path.path, timeout=2): + # check if we have another instance running already + if pidfile.exists(): + pid, starttime = parse_pidfile(pidfile) + try: + # if any other process is running at that PID, let the + # user decide if this is another legitimate + # instance. Automated programs may use the start-time to + # help decide this (if the PID is merely recycled, the + # start-time won't match). + psutil.Process(pid) + raise ProcessInTheWay( + "A process is already running as PID {}".format(pid) + ) + except psutil.NoSuchProcess: + print( + "'{pidpath}' refers to {pid} that isn't running".format( + pidpath=pidfile.path, + pid=pid, + ) + ) + # nothing is running at that PID so it must be a stale file + pidfile.remove() + + # write our PID + start-time to the pid-file + proc = psutil.Process() + with pidfile.open("w") as f: + f.write("{} {}\n".format(proc.pid, proc.create_time()).encode("utf8")) + except Timeout: + raise ProcessInTheWay( + "Another process is still locking {}".format(pidfile.path) + ) + + +def cleanup_pidfile(pidfile): + """ + Remove the pidfile specified (respecting locks). If anything at + all goes wrong, `CannotRemovePidFile` is raised. + """ + lock_path = _pidfile_to_lockpath(pidfile) + with FileLock(lock_path.path): + try: + pidfile.remove() + except Exception as e: + raise CannotRemovePidFile( + "Couldn't remove '{pidfile}': {err}.".format( + pidfile=pidfile.path, + err=e, + ) + )