From 93bb3e995a48aae28c0f45115f4cdda33d9377c6 Mon Sep 17 00:00:00 2001 From: Brian Warner Date: Wed, 4 May 2016 15:04:12 -0700 Subject: [PATCH] stats-gatherer: add --hostname/--location/--port Updates docs, tests, explains how to update an old gatherer. --- docs/stats.rst | 40 +++++++++-------- src/allmydata/scripts/stats_gatherer.py | 58 ++++++++++++++++++++++++- src/allmydata/stats.py | 27 +++++------- src/allmydata/test/common.py | 5 +++ src/allmydata/test/test_runner.py | 50 ++++++++++++++++++--- topfiles/2773.docs | 18 ++++++++ 6 files changed, 157 insertions(+), 41 deletions(-) create mode 100644 topfiles/2773.docs diff --git a/docs/stats.rst b/docs/stats.rst index 03213e839..8fbc8647a 100644 --- a/docs/stats.rst +++ b/docs/stats.rst @@ -279,11 +279,12 @@ boxes, as long as the stats-gatherer has a reachable IP address.) The stats-gatherer is created in the same fashion as regular tahoe client nodes and introducer nodes. Choose a base directory for the gatherer to live -in (but do not create the directory). Then run: +in (but do not create the directory). Choose the hostname that should be +advertised in the gatherer's FURL. Then run: :: - tahoe create-stats-gatherer $BASEDIR + tahoe create-stats-gatherer --hostname=HOSTNAME $BASEDIR and start it with "tahoe start $BASEDIR". Once running, the gatherer will write a FURL into $BASEDIR/stats_gatherer.furl . @@ -295,19 +296,23 @@ under a key named "stats_gatherer.furl", like so: :: [client] - stats_gatherer.furl = pb://qbo4ktl667zmtiuou6lwbjryli2brv6t@192.168.0.8:49997/wxycb4kaexzskubjnauxeoptympyf45y + stats_gatherer.furl = pb://qbo4ktl667zmtiuou6lwbjryli2brv6t@HOSTNAME:PORTNUM/wxycb4kaexzskubjnauxeoptympyf45y or simply copy the stats_gatherer.furl file into the node's base directory (next to the tahoe.cfg file): it will be interpreted in the same way. 
-The first time it is started, the gatherer will listen on a random unused TCP -port, so it should not conflict with anything else that you have running on -that host at that time. On subsequent runs, it will re-use the same port (to -keep its FURL consistent). To explicitly control which port it uses, write -the desired portnumber into a file named "portnum" (i.e. $BASEDIR/portnum), -and the next time the gatherer is started, it will start listening on the -given port. The portnum file is actually a "strports specification string", -as described in :doc:`configuration`. +When the gatherer is created, it will allocate a random unused TCP port, so +it should not conflict with anything else that you have running on that host +at that time. To explicitly control which port it uses, run the creation +command with ``--location=`` and ``--port=`` instead of ``--hostname=``. If +you use a hostname of ``example.org`` and a port number of ``1234``, then +run:: + + tahoe create-stats-gatherer --location=tcp:example.org:1234 --port=tcp:1234 $BASEDIR + +``--location=`` is a Foolscap FURL hints string (so it can be a +comma-separated list of connection hints), and ``--port=`` is a Twisted +"server endpoint specification string", as described in :doc:`configuration`. Once running, the stats gatherer will create a standard JSON file in ``$BASEDIR/stats.json``. Once a minute, the gatherer will pull stats @@ -322,16 +327,17 @@ Other tools can be built to examine these stats and render them into something useful. For example, a tool could sum the "storage_server.disk_avail' values from all servers to compute a total-disk-available number for the entire grid (however, the "disk watcher" -daemon, in misc/operations_helpers/spacetime/, is better suited for this specific task). +daemon, in misc/operations_helpers/spacetime/, is better suited for this +specific task). 
Using Munin To Graph Stats Values ================================= -The misc/munin/ directory contains various plugins to graph stats for Tahoe -nodes. They are intended for use with the Munin_ system-management tool, which -typically polls target systems every 5 minutes and produces a web page with -graphs of various things over multiple time scales (last hour, last month, -last year). +The misc/operations_helpers/munin/ directory contains various plugins to +graph stats for Tahoe nodes. They are intended for use with the Munin_ +system-management tool, which typically polls target systems every 5 minutes +and produces a web page with graphs of various things over multiple time +scales (last hour, last month, last year). Most of the plugins are designed to pull stats from a single Tahoe node, and are configured with the e.g. http://localhost:3456/statistics?t=json URL. The diff --git a/src/allmydata/scripts/stats_gatherer.py b/src/allmydata/scripts/stats_gatherer.py index b50b0ea6d..71a5ece14 100644 --- a/src/allmydata/scripts/stats_gatherer.py +++ b/src/allmydata/scripts/stats_gatherer.py @@ -1,14 +1,59 @@ - import os, sys - +from twisted.python import usage from allmydata.scripts.common import NoDefaultBasedirOptions from allmydata.scripts.create_node import write_tac from allmydata.util.assertutil import precondition from allmydata.util.encodingutil import listdir_unicode, quote_output +from allmydata.util import fileutil, iputil class CreateStatsGathererOptions(NoDefaultBasedirOptions): subcommand_name = "create-stats-gatherer" + optParameters = [ + ("hostname", None, None, "Hostname of this machine, used to build location"), + ("location", None, None, "FURL connection hints, e.g. 'tcp:HOSTNAME:PORT'"), + ("port", None, None, "listening endpoint, e.g. 
'tcp:PORT'"), + ] + def postOptions(self): + if self["hostname"] and (not self["location"]) and (not self["port"]): + pass + elif (not self["hostname"]) and self["location"] and self["port"]: + pass + else: + raise usage.UsageError("You must provide --hostname, or --location and --port.") + + description = """ + Create a "stats-gatherer" service, which is a standalone process that + collects and stores runtime statistics from many server nodes. This is a + tool for operations personnel to keep track of free disk space, server + load, and protocol activity, across a fleet of Tahoe storage servers. + + The "stats-gatherer" listens on a TCP port and publishes a Foolscap FURL + by writing it into a file named "stats_gatherer.furl". You must copy this + FURL into the servers' tahoe.cfg, as the [client] stats_gatherer.furl= + entry. Those servers will then establish a connection to the + stats-gatherer and publish their statistics on a periodic basis. The + gatherer writes a summary JSON file out to disk after each update. + + The stats-gatherer listens on a configurable port, and writes a + configurable hostname+port pair into the FURL that it publishes. There + are two configuration modes you can use. + + * In the first, you provide --hostname=, and the service chooses its own + TCP port number. If the host is named "example.org" and you provide + --hostname=example.org, the node will pick a port number (e.g. 12345) + and use location="tcp:example.org:12345" and port="tcp:12345". + + * In the second, you provide both --location= and --port=, and the + service will refrain from doing any allocation of its own. --location= + must be a Foolscap "FURL connection hint sequence", which is a + comma-separated list of "tcp:HOSTNAME:PORTNUM" strings. --port= must be + a Twisted server endpoint specification, which is generally + "tcp:PORTNUM". 
So, if your host is named "example.org" and you want to + use port 6789, you should provide --location=tcp:example.org:6789 and + --port=tcp:6789. You are responsible for making sure --location= and + --port= match each other. + """ def create_stats_gatherer(config, out=sys.stdout, err=sys.stderr): @@ -26,6 +71,15 @@ def create_stats_gatherer(config, out=sys.stdout, err=sys.stderr): else: os.mkdir(basedir) write_tac(basedir, "stats-gatherer") + if config["hostname"]: + portnum = iputil.allocate_tcp_port() + location = "tcp:%s:%d" % (config["hostname"], portnum) + port = "tcp:%d" % portnum + else: + location = config["location"] + port = config["port"] + fileutil.write(os.path.join(basedir, "location"), location+"\n") + fileutil.write(os.path.join(basedir, "port"), port+"\n") return 0 subCommands = [ diff --git a/src/allmydata/stats.py b/src/allmydata/stats.py index 4a203b4e3..e107c5042 100644 --- a/src/allmydata/stats.py +++ b/src/allmydata/stats.py @@ -11,7 +11,7 @@ from twisted.application.internet import TimerService from zope.interface import implements from foolscap.api import eventually, DeadReferenceError, Referenceable, Tub -from allmydata.util import log, fileutil +from allmydata.util import log from allmydata.util.encodingutil import quote_local_unicode_path from allmydata.interfaces import RIStatsProvider, RIStatsGatherer, IStatsProducer @@ -294,24 +294,19 @@ class StatsGathererService(service.MultiService): self.stats_gatherer = JSONStatsGatherer(self.basedir, verbose) self.stats_gatherer.setServiceParent(self) - portnumfile = os.path.join(self.basedir, "portnum") try: - portnum = open(portnumfile, "r").read() + with open(os.path.join(self.basedir, "location")) as f: + location = f.read().strip() except EnvironmentError: - portnum = None - self.listener = self.tub.listenOn(portnum or "tcp:0") - d = self.tub.setLocationAutomatically() - if portnum is None: - d.addCallback(self.save_portnum) - d.addCallback(self.tub_ready) - d.addErrback(log.err) + raise 
ValueError("Unable to find 'location' in BASEDIR, please rebuild your stats-gatherer") + try: + with open(os.path.join(self.basedir, "port")) as f: + port = f.read().strip() + except EnvironmentError: + raise ValueError("Unable to find 'port' in BASEDIR, please rebuild your stats-gatherer") - def save_portnum(self, junk): - portnum = self.listener.getPortnum() - portnumfile = os.path.join(self.basedir, 'portnum') - fileutil.write(portnumfile, '%d\n' % (portnum,)) - - def tub_ready(self, ignored): + self.tub.listenOn(port) + self.tub.setLocation(location) ff = os.path.join(self.basedir, self.furl_file) self.gatherer_furl = self.tub.registerReference(self.stats_gatherer, furlFile=ff) diff --git a/src/allmydata/test/common.py b/src/allmydata/test/common.py index 6c291d350..e93c60247 100644 --- a/src/allmydata/test/common.py +++ b/src/allmydata/test/common.py @@ -494,6 +494,11 @@ class SystemTestMixin(pollmixin.PollMixin, testutil.StallMixin): def _set_up_stats_gatherer(self, res): statsdir = self.getdir("stats_gatherer") fileutil.make_dirs(statsdir) + portnum = iputil.allocate_tcp_port() + location = "tcp:127.0.0.1:%d" % portnum + fileutil.write(os.path.join(statsdir, "location"), location) + port = "tcp:%d:interface=127.0.0.1" % portnum + fileutil.write(os.path.join(statsdir, "port"), port) self.stats_gatherer_svc = StatsGathererService(statsdir) self.stats_gatherer = self.stats_gatherer_svc.stats_gatherer self.add_service(self.stats_gatherer_svc) diff --git a/src/allmydata/test/test_runner.py b/src/allmydata/test/test_runner.py index 26ee369c0..837670d7d 100644 --- a/src/allmydata/test/test_runner.py +++ b/src/allmydata/test/test_runner.py @@ -184,14 +184,14 @@ class CreateNode(unittest.TestCase): rc = runner.runner(argv, stdout=out, stderr=err) return rc, out.getvalue(), err.getvalue() - def do_create(self, kind): + def do_create(self, kind, *args): basedir = self.workdir("test_" + kind) command = "create-" + kind is_client = kind in ("node", "client") tac = 
is_client and "tahoe-client.tac" or ("tahoe-" + kind + ".tac") n1 = os.path.join(basedir, command + "-n1") - argv = ["--quiet", command, "--basedir", n1] + argv = ["--quiet", command, "--basedir", n1] + list(args) rc, out, err = self.run_tahoe(argv) self.failUnlessEqual(err, "") self.failUnlessEqual(out, "") @@ -226,7 +226,7 @@ class CreateNode(unittest.TestCase): # test that the non --basedir form works too n2 = os.path.join(basedir, command + "-n2") - argv = ["--quiet", command, n2] + argv = ["--quiet", command] + list(args) + [n2] rc, out, err = self.run_tahoe(argv) self.failUnlessEqual(err, "") self.failUnlessEqual(out, "") @@ -236,7 +236,7 @@ class CreateNode(unittest.TestCase): # test the --node-directory form n3 = os.path.join(basedir, command + "-n3") - argv = ["--quiet", "--node-directory", n3, command] + argv = ["--quiet", "--node-directory", n3, command] + list(args) rc, out, err = self.run_tahoe(argv) self.failUnlessEqual(err, "") self.failUnlessEqual(out, "") @@ -247,7 +247,7 @@ class CreateNode(unittest.TestCase): if kind in ("client", "node", "introducer"): # test that the output (without --quiet) includes the base directory n4 = os.path.join(basedir, command + "-n4") - argv = [command, n4] + argv = [command] + list(args) + [n4] rc, out, err = self.run_tahoe(argv) self.failUnlessEqual(err, "") self.failUnlessIn(" created in ", out) @@ -282,7 +282,7 @@ class CreateNode(unittest.TestCase): self.do_create("introducer") def test_stats_gatherer(self): - self.do_create("stats-gatherer") + self.do_create("stats-gatherer", "--hostname=127.0.0.1") def test_subcommands(self): # no arguments should trigger a command listing, via UsageError @@ -291,6 +291,44 @@ class CreateNode(unittest.TestCase): [], run_by_human=False) + def test_stats_gatherer_good_args(self): + rc = runner.runner(["create-stats-gatherer", "--hostname=foo", + self.mktemp()]) + self.assertEqual(rc, 0) + rc = runner.runner(["create-stats-gatherer", "--location=tcp:foo:1234", + 
"--port=tcp:1234", self.mktemp()]) + self.assertEqual(rc, 0) + + def test_stats_gatherer_bad_args(self): + # missing hostname/location/port + argv = "create-stats-gatherer D" + self.assertRaises(usage.UsageError, runner.runner, argv.split(), + run_by_human=False) + + # missing port + argv = "create-stats-gatherer --location=foo D" + self.assertRaises(usage.UsageError, runner.runner, argv.split(), + run_by_human=False) + + # missing location + argv = "create-stats-gatherer --port=foo D" + self.assertRaises(usage.UsageError, runner.runner, argv.split(), + run_by_human=False) + + # can't provide both + argv = "create-stats-gatherer --hostname=foo --port=foo D" + self.assertRaises(usage.UsageError, runner.runner, argv.split(), + run_by_human=False) + + # can't provide both + argv = "create-stats-gatherer --hostname=foo --location=foo D" + self.assertRaises(usage.UsageError, runner.runner, argv.split(), + run_by_human=False) + + # can't provide all three + argv = "create-stats-gatherer --hostname=foo --location=foo --port=foo D" + self.assertRaises(usage.UsageError, runner.runner, argv.split(), + run_by_human=False) class RunNode(common_util.SignalMixin, unittest.TestCase, pollmixin.PollMixin, RunBinTahoeMixin): diff --git a/topfiles/2773.docs b/topfiles/2773.docs new file mode 100644 index 000000000..66a093c68 --- /dev/null +++ b/topfiles/2773.docs @@ -0,0 +1,18 @@ +The "stats-gatherer", an operation-helper service used to collect runtime +statistics from a fleet of Tahoe storage servers, must now be assigned a +hostname, or location+port pair, at creation time. The "tahoe +create-stats-gatherer" command now requires either "--hostname=", or both +"--location=" and "--port". + +Previously, "tahoe create-stats-gatherer NODEDIR" would attempt to guess its +location by running something like /sbin/ifconfig to collect local IP +addresses. 
While this works if the host has a public IP address (or at least +lives on the same LAN as the storage servers it monitors), most sysadmins +would prefer the FURL be created with a real hostname. + +To keep your old stats-gatherers working, with their original FURL, you must +determine a suitable --location and --port, and write their values into +NODEDIR/location and NODEDIR/port, respectively. Or you could simply rebuild +it by re-running "tahoe create-stats-gatherer" with the new arguments (the rebuilt gatherer will publish a new FURL, which must then be copied into each server's tahoe.cfg). + +See docs/stats.rst for details.