mirror of
https://github.com/tahoe-lafs/tahoe-lafs.git
synced 2025-01-09 22:42:57 +00:00
311 lines
11 KiB
Python
311 lines
11 KiB
Python
# -*- python -*-
|
|
|
|
"""
|
|
Run this tool with twistd in its own directory, with a file named 'urls.txt'
|
|
describing which nodes to query. Make sure to copy diskwatcher.py into the
|
|
same directory. It will request disk-usage numbers from the nodes once per
|
|
hour (or slower), and store them in a local database. It will compute
|
|
usage-per-unit time values over several time ranges and make them available
|
|
through an HTTP query (using ./webport). It will also provide an estimate of
|
|
how much time is left before the grid's storage is exhausted.
|
|
|
|
There are munin plugins (named tahoe_doomsday and tahoe_diskusage) to graph
|
|
the values this tool computes.
|
|
|
|
Each line of urls.txt points to a single node. Each node should have its own
|
|
dedicated disk: if multiple nodes share a disk, only list one of them in
|
|
urls.txt (otherwise that space will be double-counted, confusing the
|
|
results). Each line should be in the form:
|
|
|
|
http://host:webport/statistics?t=json
|
|
|
|
"""
|
|
|
|
# TODO:
|
|
# built-in graphs on web interface
|
|
|
|
|
|
import os.path, urllib
|
|
from datetime import timedelta
|
|
from twisted.application import internet, service, strports
|
|
from twisted.web import server, resource, http, client
|
|
from twisted.internet import defer
|
|
from twisted.python import log
|
|
import simplejson
|
|
from axiom.attributes import AND
|
|
from axiom.store import Store
|
|
from epsilon import extime
|
|
from diskwatcher import Sample
|
|
|
|
#from axiom.item import Item
|
|
#from axiom.attributes import text, integer, timestamp
|
|
|
|
#class Sample(Item):
|
|
# url = text()
|
|
# when = timestamp()
|
|
# used = integer()
|
|
# avail = integer()
|
|
|
|
#s = Store("history.axiom")
|
|
#ns = Store("new-history.axiom")
|
|
#for sa in s.query(Sample):
|
|
# diskwatcher.Sample(store=ns,
|
|
# url=sa.url, when=sa.when, used=sa.used, avail=sa.avail)
|
|
#print "done"
|
|
|
|
HOUR = 3600
|
|
DAY = 24*3600
|
|
WEEK = 7*DAY
|
|
MONTH = 30*DAY
|
|
YEAR = 365*DAY
|
|
|
|
class DiskWatcher(service.MultiService, resource.Resource):
|
|
POLL_INTERVAL = 1*HOUR
|
|
AVERAGES = {#"60s": 60,
|
|
#"5m": 5*60,
|
|
#"30m": 30*60,
|
|
"1hr": 1*HOUR,
|
|
"1day": 1*DAY,
|
|
"2wk": 2*WEEK,
|
|
"4wk": 4*WEEK,
|
|
}
|
|
|
|
def __init__(self):
|
|
assert os.path.exists("diskwatcher.tac") # run from the right directory
|
|
service.MultiService.__init__(self)
|
|
resource.Resource.__init__(self)
|
|
self.store = Store("history.axiom")
|
|
ts = internet.TimerService(self.POLL_INTERVAL, self.poll)
|
|
ts.setServiceParent(self)
|
|
|
|
def startService(self):
|
|
service.MultiService.startService(self)
|
|
|
|
try:
|
|
desired_webport = open("webport", "r").read().strip()
|
|
except EnvironmentError:
|
|
desired_webport = None
|
|
webport = desired_webport or "tcp:0"
|
|
root = self
|
|
serv = strports.service(webport, server.Site(root))
|
|
serv.setServiceParent(self)
|
|
if not desired_webport:
|
|
got_port = serv._port.getHost().port
|
|
open("webport", "w").write("tcp:%d\n" % got_port)
|
|
|
|
|
|
def get_urls(self):
|
|
for url in open("urls.txt","r").readlines():
|
|
if "#" in url:
|
|
url = url[:url.find("#")]
|
|
url = url.strip()
|
|
if not url:
|
|
continue
|
|
yield url
|
|
|
|
def poll(self):
|
|
log.msg("polling..")
|
|
#return self.poll_synchronous()
|
|
return self.poll_asynchronous()
|
|
|
|
def poll_asynchronous(self):
|
|
# this didn't actually seem to work any better than poll_synchronous:
|
|
# logs are more noisy, and I got frequent DNS failures. But with a
|
|
# lot of servers to query, this is probably the better way to go. A
|
|
# significant advantage of this approach is that we can use a
|
|
# timeout= argument to tolerate hanging servers.
|
|
dl = []
|
|
for url in self.get_urls():
|
|
when = extime.Time()
|
|
d = client.getPage(url, timeout=60)
|
|
d.addCallback(self.got_response, when, url)
|
|
dl.append(d)
|
|
d = defer.DeferredList(dl)
|
|
def _done(res):
|
|
fetched = len([1 for (success, value) in res if success])
|
|
log.msg("fetched %d of %d" % (fetched, len(dl)))
|
|
d.addCallback(_done)
|
|
return d
|
|
|
|
def poll_synchronous(self):
|
|
attempts = 0
|
|
fetched = 0
|
|
for url in self.get_urls():
|
|
attempts += 1
|
|
try:
|
|
when = extime.Time()
|
|
# if a server accepts the connection and then hangs, this
|
|
# will block forever
|
|
data_json = urllib.urlopen(url).read()
|
|
self.got_response(data_json, when, url)
|
|
fetched += 1
|
|
except:
|
|
log.msg("error while fetching: %s" % url)
|
|
log.err()
|
|
log.msg("fetched %d of %d" % (fetched, attempts))
|
|
|
|
def got_response(self, data_json, when, url):
|
|
data = simplejson.loads(data_json)
|
|
total = data[u"stats"][u"storage_server.disk_total"]
|
|
used = data[u"stats"][u"storage_server.disk_used"]
|
|
avail = data[u"stats"][u"storage_server.disk_avail"]
|
|
print "%s : total=%s, used=%s, avail=%s" % (url,
|
|
total, used, avail)
|
|
Sample(store=self.store,
|
|
url=unicode(url), when=when, used=used, avail=avail)
|
|
|
|
def calculate_growth_timeleft(self):
|
|
timespans = []
|
|
total_avail_space = self.find_total_available_space()
|
|
pairs = [ (timespan,name)
|
|
for name,timespan in self.AVERAGES.items() ]
|
|
pairs.sort()
|
|
for (timespan,name) in pairs:
|
|
growth = self.growth(timespan)
|
|
print name, total_avail_space, growth
|
|
if growth is not None:
|
|
timeleft = None
|
|
if growth > 0:
|
|
timeleft = total_avail_space / growth
|
|
timespans.append( (name, timespan, growth, timeleft) )
|
|
return timespans
|
|
|
|
def find_total_available_space(self):
|
|
# this returns the sum of disk-avail stats for all servers that 1)
|
|
# are listed in urls.txt and 2) have responded recently.
|
|
now = extime.Time()
|
|
recent = now - timedelta(seconds=2*self.POLL_INTERVAL)
|
|
total_avail_space = 0
|
|
for url in self.get_urls():
|
|
url = unicode(url)
|
|
latest = list(self.store.query(Sample,
|
|
AND(Sample.url == url,
|
|
Sample.when > recent),
|
|
sort=Sample.when.descending,
|
|
limit=1))
|
|
if latest:
|
|
total_avail_space += latest[0].avail
|
|
return total_avail_space
|
|
|
|
def find_total_used_space(self):
|
|
# this returns the sum of disk-used stats for all servers that 1) are
|
|
# listed in urls.txt and 2) have responded recently.
|
|
now = extime.Time()
|
|
recent = now - timedelta(seconds=2*self.POLL_INTERVAL)
|
|
total_used_space = 0
|
|
for url in self.get_urls():
|
|
url = unicode(url)
|
|
latest = list(self.store.query(Sample,
|
|
AND(Sample.url == url,
|
|
Sample.when > recent),
|
|
sort=Sample.when.descending,
|
|
limit=1))
|
|
if latest:
|
|
total_used_space += latest[0].used
|
|
return total_used_space
|
|
|
|
|
|
def growth(self, timespan):
|
|
"""Calculate the bytes-per-second growth of the total disk-used stat,
|
|
over a period of TIMESPAN seconds (i.e. between the most recent
|
|
sample and the latest one that's at least TIMESPAN seconds ago),
|
|
summed over all nodes which 1) are listed in urls.txt, 2) have
|
|
responded recently, and 3) have a response at least as old as
|
|
TIMESPAN. If there are no nodes which meet these criteria, we'll
|
|
return None; this is likely to happen for the longer timespans (4wk)
|
|
until the gatherer has been running and collecting data for that
|
|
long."""
|
|
|
|
td = timedelta(seconds=timespan)
|
|
now = extime.Time()
|
|
then = now - td
|
|
recent = now - timedelta(seconds=2*self.POLL_INTERVAL)
|
|
|
|
total_growth = 0.0
|
|
num_nodes = 0
|
|
|
|
for url in self.get_urls():
|
|
url = unicode(url)
|
|
latest = list(self.store.query(Sample,
|
|
AND(Sample.url == url,
|
|
Sample.when > recent),
|
|
sort=Sample.when.descending,
|
|
limit=1))
|
|
if not latest:
|
|
#print "no latest sample from", url
|
|
continue # skip this node
|
|
latest = latest[0]
|
|
old = list(self.store.query(Sample,
|
|
AND(Sample.url == url,
|
|
Sample.when < then),
|
|
sort=Sample.when.descending,
|
|
limit=1))
|
|
if not old:
|
|
#print "no old sample from", url
|
|
continue # skip this node
|
|
old = old[0]
|
|
duration = latest.when.asPOSIXTimestamp() - old.when.asPOSIXTimestamp()
|
|
if not duration:
|
|
print "only one sample from", url
|
|
continue
|
|
|
|
rate = float(latest.used - old.used) / duration
|
|
#print url, rate
|
|
total_growth += rate
|
|
num_nodes += 1
|
|
|
|
if not num_nodes:
|
|
return None
|
|
return total_growth
|
|
|
|
def getChild(self, path, req):
|
|
if path == "":
|
|
return self
|
|
return resource.Resource.getChild(self, path, req)
|
|
|
|
def abbreviate_time(self, s):
|
|
def _plural(count, unit):
|
|
count = int(count)
|
|
if count == 1:
|
|
return "%d %s" % (count, unit)
|
|
return "%d %ss" % (count, unit)
|
|
if s is None:
|
|
return "unknown"
|
|
if s < 120:
|
|
return _plural(s, "second")
|
|
if s < 3*HOUR:
|
|
return _plural(s/60, "minute")
|
|
if s < 2*DAY:
|
|
return _plural(s/HOUR, "hour")
|
|
if s < 2*MONTH:
|
|
return _plural(s/DAY, "day")
|
|
if s < 4*YEAR:
|
|
return _plural(s/MONTH, "month")
|
|
return _plural(s/YEAR, "year")
|
|
|
|
def render(self, req):
|
|
t = req.args.get("t", ["html"])[0]
|
|
ctype = "text/plain"
|
|
data = ""
|
|
if t == "html":
|
|
data = ""
|
|
for (name, timespan, growth, timeleft) in self.calculate_growth_timeleft():
|
|
data += "%f bytes per second, %s remaining (over %s)\n" % \
|
|
(growth, self.abbreviate_time(timeleft), name)
|
|
used = self.find_total_used_space()
|
|
data += "total used: %d bytes\n" % used
|
|
elif t == "json":
|
|
current = {"rates": self.calculate_growth_timeleft(),
|
|
"used": self.find_total_used_space(),
|
|
"available": self.find_total_available_space(),
|
|
}
|
|
data = simplejson.dumps(current, indent=True)
|
|
else:
|
|
req.setResponseCode(http.BAD_REQUEST)
|
|
data = "Unknown t= %s\n" % t
|
|
req.setHeader("content-type", ctype)
|
|
return data
|
|
|
|
application = service.Application("disk-watcher")
|
|
DiskWatcher().setServiceParent(application)
|