mirror of
https://github.com/tahoe-lafs/tahoe-lafs.git
synced 2025-02-20 09:46:18 +00:00
misc/cpu-watcher*: add some tools to monitor CPU usage of arbitrary processes, like tahoe nodes
This commit is contained in:
parent
c1184f51e4
commit
79147a3f6a
24
misc/cpu-watcher-poll.py
Normal file
24
misc/cpu-watcher-poll.py
Normal file
@ -0,0 +1,24 @@
|
||||
#! /usr/bin/python
|
||||
|
||||
from foolscap import Tub, eventual
|
||||
from twisted.internet import reactor
|
||||
import sys
|
||||
import pprint
|
||||
|
||||
def oops(f):
|
||||
print "ERROR"
|
||||
print f
|
||||
|
||||
def fetch(furl):
|
||||
t = Tub()
|
||||
t.startService()
|
||||
d = t.getReference(furl)
|
||||
d.addCallback(lambda rref: rref.callRemote("get_averages"))
|
||||
d.addCallback(pprint.pprint)
|
||||
return d
|
||||
|
||||
d = eventual.fireEventually(sys.argv[1])
|
||||
d.addCallback(fetch)
|
||||
d.addErrback(oops)
|
||||
d.addBoth(lambda res: reactor.stop())
|
||||
reactor.run()
|
49
misc/cpu-watcher-subscribe.py
Normal file
49
misc/cpu-watcher-subscribe.py
Normal file
@ -0,0 +1,49 @@
|
||||
# -*- python -*-
|
||||
|
||||
from twisted.internet import reactor
|
||||
import sys
|
||||
|
||||
import os.path, pprint
|
||||
from twisted.application import service
|
||||
from twisted.python import log
|
||||
from foolscap import Tub, Referenceable, RemoteInterface
|
||||
from foolscap.schema import ListOf, TupleOf
|
||||
from zope.interface import implements
|
||||
|
||||
Averages = ListOf( TupleOf(str, float, float, float) )
|
||||
class RICPUWatcherSubscriber(RemoteInterface):
|
||||
def averages(averages=Averages):
|
||||
return None
|
||||
|
||||
class CPUWatcherSubscriber(service.MultiService, Referenceable):
|
||||
implements(RICPUWatcherSubscriber)
|
||||
def __init__(self, furlthing):
|
||||
service.MultiService.__init__(self)
|
||||
if furlthing.startswith("pb://"):
|
||||
furl = furlthing
|
||||
else:
|
||||
furlfile = os.path.expanduser(furlthing)
|
||||
if os.path.isdir(furlfile):
|
||||
furlfile = os.path.join(furlfile, "watcher.furl")
|
||||
furl = open(furlfile, "r").read().strip()
|
||||
tub = Tub()
|
||||
tub.setServiceParent(self)
|
||||
tub.connectTo(furl, self.connected)
|
||||
|
||||
def connected(self, rref):
|
||||
print "subscribing"
|
||||
d = rref.callRemote("get_averages")
|
||||
d.addCallback(self.remote_averages)
|
||||
d.addErrback(log.err)
|
||||
|
||||
d = rref.callRemote("subscribe", self)
|
||||
d.addErrback(log.err)
|
||||
|
||||
def remote_averages(self, averages):
|
||||
pprint.pprint(averages)
|
||||
|
||||
|
||||
c = CPUWatcherSubscriber(sys.argv[1])
|
||||
c.startService()
|
||||
reactor.run()
|
||||
|
221
misc/cpu-watcher.tac
Normal file
221
misc/cpu-watcher.tac
Normal file
@ -0,0 +1,221 @@
|
||||
# -*- python -*-
|
||||
|
||||
"""
|
||||
# run this tool on a linux box in its own directory, with a file named
|
||||
# 'pids.txt' describing which processes to watch. It will follow CPU usage of
|
||||
# the given processes, and compute 1/5/15-minute moving averages for each
|
||||
# process. These averages can be retrieved from a foolscap connection
|
||||
# (published at ./watcher.furl), or through an HTTP query (using ./webport).
|
||||
|
||||
# Each line of pids.txt describes a single process. Blank lines and ones that
|
||||
# begin with '#' are ignored. Each line is either "PID" or "PID NAME" (space
|
||||
# separated). PID is either a numeric process ID, a pathname to a file that
|
||||
# contains a process ID, or a pathname to a directory that contains a
|
||||
# twistd.pid file (which contains a process ID). NAME is an arbitrary string
|
||||
# that will be used to describe the process to watcher.furl subscribers, and
|
||||
# defaults to PID if not provided.
|
||||
"""
|
||||
|
||||
# TODO:
|
||||
# built-in graphs on web interface
|
||||
|
||||
|
||||
|
||||
import pickle, os.path, time, pprint
|
||||
from twisted.application import internet, service, strports
|
||||
from twisted.web import server, resource, http
|
||||
from twisted.python import log
|
||||
import simplejson
|
||||
from foolscap import Tub, Referenceable, RemoteInterface, eventual
|
||||
from foolscap.schema import ListOf, TupleOf
|
||||
from zope.interface import implements
|
||||
|
||||
def read_cpu_times(pid):
|
||||
data = open("/proc/%d/stat" % pid, "r").read()
|
||||
data = data.split()
|
||||
times = data[13:17]
|
||||
# the values in /proc/%d/stat are in ticks, I think. My system has
|
||||
# CONFIG_HZ_1000=y in /proc/config.gz but nevertheless the numbers in
|
||||
# 'stat' appear to be 10ms each.
|
||||
HZ = 100
|
||||
userspace_seconds = int(times[0]) * 1.0 / HZ
|
||||
system_seconds = int(times[1]) * 1.0 / HZ
|
||||
child_userspace_seconds = int(times[2]) * 1.0 / HZ
|
||||
child_system_seconds = int(times[3]) * 1.0 / HZ
|
||||
return (userspace_seconds, system_seconds)
|
||||
|
||||
|
||||
def read_pids_txt():
|
||||
processes = []
|
||||
for line in open("pids.txt", "r").readlines():
|
||||
line = line.strip()
|
||||
if not line or line[0] == "#":
|
||||
continue
|
||||
parts = line.split()
|
||||
pidthing = parts[0]
|
||||
if len(parts) > 1:
|
||||
name = parts[1]
|
||||
else:
|
||||
name = pidthing
|
||||
try:
|
||||
pid = int(pidthing)
|
||||
except ValueError:
|
||||
pidfile = os.path.expanduser(pidthing)
|
||||
if os.path.isdir(pidfile):
|
||||
pidfile = os.path.join(pidfile, "twistd.pid")
|
||||
pid = int(open(pidfile, "r").read().strip())
|
||||
processes.append( (pid, name) )
|
||||
return processes
|
||||
|
||||
Averages = ListOf( TupleOf(str, float, float, float) )
|
||||
class RICPUWatcherSubscriber(RemoteInterface):
|
||||
def averages(averages=Averages):
|
||||
return None
|
||||
|
||||
class RICPUWatcher(RemoteInterface):
|
||||
def get_averages():
|
||||
"""Return a list of rows, one for each process I am watching. Each
|
||||
row is (name, 1-min-avg, 5-min-avg, 15-min-avg), where 'name' is a
|
||||
string, and the averages are floats from 0.0 to 1.0 . Each average is
|
||||
the percentage of the CPU that this process has used: the change in
|
||||
CPU time divided by the change in wallclock time.
|
||||
"""
|
||||
return Averages
|
||||
|
||||
def subscribe(observer=RICPUWatcherSubscriber):
|
||||
"""Arrange for the given observer to get an 'averages' message every
|
||||
time the averages are updated. This message will contain a single
|
||||
argument, the same list of tuples that get_averages() returns."""
|
||||
return None
|
||||
|
||||
class CPUWatcher(service.MultiService, resource.Resource, Referenceable):
|
||||
implements(RICPUWatcher)
|
||||
POLL_INTERVAL = 30 # seconds
|
||||
HISTORY_LIMIT = 15 * 60 # 15min
|
||||
AVERAGES = (1*60, 5*60, 15*60) # 1min, 5min, 15min
|
||||
|
||||
def __init__(self):
|
||||
service.MultiService.__init__(self)
|
||||
resource.Resource.__init__(self)
|
||||
try:
|
||||
self.history = pickle.load(open("history.pickle", "rb"))
|
||||
except EnvironmentError:
|
||||
self.history = {}
|
||||
self.current = []
|
||||
self.observers = set()
|
||||
ts = internet.TimerService(self.POLL_INTERVAL, self.poll)
|
||||
ts.setServiceParent(self)
|
||||
|
||||
def startService(self):
|
||||
service.MultiService.startService(self)
|
||||
|
||||
try:
|
||||
desired_webport = open("webport", "r").read().strip()
|
||||
except EnvironmentError:
|
||||
desired_webport = None
|
||||
webport = desired_webport or "tcp:0"
|
||||
root = self
|
||||
serv = strports.service(webport, server.Site(root))
|
||||
serv.setServiceParent(self)
|
||||
if not desired_webport:
|
||||
got_port = serv._port.getHost().port
|
||||
open("webport", "w").write("tcp:%d\n" % got_port)
|
||||
|
||||
self.tub = Tub(certFile="watcher.pem")
|
||||
self.tub.setServiceParent(self)
|
||||
try:
|
||||
desired_tubport = open("tubport", "r").read().strip()
|
||||
except EnvironmentError:
|
||||
desired_tubport = None
|
||||
tubport = desired_tubport or "tcp:0"
|
||||
l = self.tub.listenOn(tubport)
|
||||
if not desired_tubport:
|
||||
got_port = l.getPortnum()
|
||||
open("tubport", "w").write("tcp:%d\n" % got_port)
|
||||
d = self.tub.setLocationAutomatically()
|
||||
d.addCallback(self._tub_ready)
|
||||
d.addErrback(log.err)
|
||||
|
||||
def _tub_ready(self, res):
|
||||
self.tub.registerReference(self, furlFile="watcher.furl")
|
||||
|
||||
|
||||
def getChild(self, path, req):
|
||||
if path == "":
|
||||
return self
|
||||
return resource.Resource.getChild(self, path, req)
|
||||
|
||||
def render(self, req):
|
||||
t = req.args.get("t", ["html"])[0]
|
||||
ctype = "text/plain"
|
||||
data = ""
|
||||
if t == "html":
|
||||
data = "# name, 1min, 5min, 15min\n"
|
||||
data += pprint.pformat(self.current) + "\n"
|
||||
elif t == "json":
|
||||
#data = str(self.current) + "\n" # isn't that convenient? almost.
|
||||
data = simplejson.dumps(self.current, indent=True)
|
||||
else:
|
||||
req.setResponseCode(http.BAD_REQUEST)
|
||||
data = "Unknown t= %s\n" % t
|
||||
req.setHeader("content-type", ctype)
|
||||
return data
|
||||
|
||||
def remote_get_averages(self):
|
||||
return self.current
|
||||
def remote_subscribe(self, observer):
|
||||
self.observers.add(observer)
|
||||
|
||||
def notify(self, observer):
|
||||
d = observer.callRemote("averages", self.current)
|
||||
def _error(f):
|
||||
log.msg("observer error, removing them")
|
||||
log.msg(f)
|
||||
self.observers.discard(observer)
|
||||
d.addErrback(_error)
|
||||
|
||||
def poll(self):
|
||||
max_history = self.HISTORY_LIMIT / self.POLL_INTERVAL
|
||||
current = []
|
||||
try:
|
||||
processes = read_pids_txt()
|
||||
except:
|
||||
log.err()
|
||||
return
|
||||
for (pid, name) in processes:
|
||||
if pid not in self.history:
|
||||
self.history[pid] = []
|
||||
now = time.time()
|
||||
try:
|
||||
(user_seconds, sys_seconds) = read_cpu_times(pid)
|
||||
self.history[pid].append( (now, user_seconds, sys_seconds) )
|
||||
while len(self.history[pid]) > max_history+1:
|
||||
self.history[pid].pop(0)
|
||||
except:
|
||||
log.err()
|
||||
pickle.dump(self.history, open("history.pickle", "wb"))
|
||||
for (pid, name) in processes:
|
||||
row = [name]
|
||||
for avg in self.AVERAGES:
|
||||
row.append(self._average_N(pid, avg))
|
||||
current.append(tuple(row))
|
||||
self.current = current
|
||||
print current
|
||||
for ob in self.observers:
|
||||
eventual.eventually(self.notify, ob)
|
||||
|
||||
def _average_N(self, pid, seconds):
|
||||
num_samples = seconds / self.POLL_INTERVAL
|
||||
samples = self.history[pid]
|
||||
if len(samples) < num_samples+1:
|
||||
return None
|
||||
first = -num_samples-1
|
||||
elapsed_wall = samples[-1][0] - samples[first][0]
|
||||
elapsed_user = samples[-1][1] - samples[first][1]
|
||||
elapsed_sys = samples[-1][2] - samples[first][2]
|
||||
if elapsed_wall == 0.0:
|
||||
return 0.0
|
||||
return (elapsed_user+elapsed_sys) / elapsed_wall
|
||||
|
||||
application = service.Application("cpu-watcher")
|
||||
CPUWatcher().setServiceParent(application)
|
Loading…
x
Reference in New Issue
Block a user