mirror of
https://github.com/tahoe-lafs/tahoe-lafs.git
synced 2025-06-22 08:50:07 +00:00
server.py: get_latencies now reports percentiles _only_ if there are sufficient observations for the interpretation of the percentile to be unambiguous.
interfaces.py: modified the return type of RIStatsProvider.get_stats to allow for None as a return value NEWS.rst, stats.py: documentation of change to get_latencies stats.rst: now documents percentile modification in get_latencies test_storage.py: test_latencies now expects None in output categories that contain too few samples for the associated percentile to be unambiguously reported. fixes #1392
This commit is contained in:
12
NEWS.rst
12
NEWS.rst
@ -1,7 +1,17 @@
|
|||||||
==================================
|
==================================
|
||||||
User-Visible Changes in Tahoe-LAFS
|
User-Visible Changes in Tahoe-LAFS
|
||||||
==================================
|
==================================
|
||||||
|
|
||||||
|
Release 1.9.0 (2011-??-??)
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
|
||||||
|
- Nodes now emit "None" for percentiles with higher implied precision
|
||||||
|
than the number of observations can support. Older stats gatherers
|
||||||
|
will throw an exception if they gather stats from a new storage
|
||||||
|
server and it sends a "None" for a percentile. (`#1392`_)
|
||||||
|
|
||||||
|
|
||||||
Release 1.8.2 (2011-01-30)
|
Release 1.8.2 (2011-01-30)
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
================
|
================
|
||||||
Tahoe Statistics
|
Tahoe Statistics
|
||||||
================
|
================
|
||||||
|
|
||||||
@ -44,7 +44,7 @@ The currently available stats (as of release 1.6.0 or so) are described here:
|
|||||||
this group counts inbound storage-server operations. They are not provided
|
this group counts inbound storage-server operations. They are not provided
|
||||||
by client-only nodes which have been configured to not run a storage server
|
by client-only nodes which have been configured to not run a storage server
|
||||||
(with [storage]enabled=false in tahoe.cfg)
|
(with [storage]enabled=false in tahoe.cfg)
|
||||||
|
|
||||||
allocate, write, close, abort
|
allocate, write, close, abort
|
||||||
these are for immutable file uploads. 'allocate' is incremented when a
|
these are for immutable file uploads. 'allocate' is incremented when a
|
||||||
client asks if it can upload a share to the server. 'write' is
|
client asks if it can upload a share to the server. 'write' is
|
||||||
@ -134,6 +134,14 @@ The currently available stats (as of release 1.6.0 or so) are described here:
|
|||||||
999 out of the last 1000 operations were faster than the
|
999 out of the last 1000 operations were faster than the
|
||||||
given number, and is the same threshold used by Amazon's
|
given number, and is the same threshold used by Amazon's
|
||||||
internal SLA, according to the Dynamo paper).
|
internal SLA, according to the Dynamo paper).
|
||||||
|
Percentiles are only reported in the case of a sufficient
|
||||||
|
number of observations for unambiguous interpretation. For
|
||||||
|
example, the 99.9th percentile is (at the level of thousandths
|
||||||
|
precision) 9 thousandths greater than the 99th
|
||||||
|
percentile for sample sizes greater than or equal to 1000,
|
||||||
|
thus the 99.9th percentile is only reported for samples of 1000
|
||||||
|
or more observations.
|
||||||
|
|
||||||
|
|
||||||
**counters.uploader.files_uploaded**
|
**counters.uploader.files_uploaded**
|
||||||
|
|
||||||
@ -195,7 +203,7 @@ The currently available stats (as of release 1.6.0 or so) are described here:
|
|||||||
|
|
||||||
active_uploads
|
active_uploads
|
||||||
how many files are currently being uploaded. 0 when idle.
|
how many files are currently being uploaded. 0 when idle.
|
||||||
|
|
||||||
incoming_count
|
incoming_count
|
||||||
how many cache files are present in the incoming/ directory,
|
how many cache files are present in the incoming/ directory,
|
||||||
which holds ciphertext files that are still being fetched
|
which holds ciphertext files that are still being fetched
|
||||||
|
@ -2390,12 +2390,12 @@ class RIStatsProvider(RemoteInterface):
|
|||||||
def get_stats():
|
def get_stats():
|
||||||
"""
|
"""
|
||||||
returns a dictionary containing 'counters' and 'stats', each a
|
returns a dictionary containing 'counters' and 'stats', each a
|
||||||
dictionary with string counter/stat name keys, and numeric values.
|
dictionary with string counter/stat name keys, and numeric or None values.
|
||||||
counters are monotonically increasing measures of work done, and
|
counters are monotonically increasing measures of work done, and
|
||||||
stats are instantaneous measures (potentially time averaged
|
stats are instantaneous measures (potentially time averaged
|
||||||
internally)
|
internally)
|
||||||
"""
|
"""
|
||||||
return DictOf(str, DictOf(str, ChoiceOf(float, int, long)))
|
return DictOf(str, DictOf(str, ChoiceOf(float, int, long, None)))
|
||||||
|
|
||||||
class RIStatsGatherer(RemoteInterface):
|
class RIStatsGatherer(RemoteInterface):
|
||||||
__remote_name__ = "RIStatsGatherer.tahoe.allmydata.com"
|
__remote_name__ = "RIStatsGatherer.tahoe.allmydata.com"
|
||||||
|
@ -116,12 +116,15 @@ class StorageServer(service.MultiService, Referenceable):
|
|||||||
|
|
||||||
def get_latencies(self):
|
def get_latencies(self):
|
||||||
"""Return a dict, indexed by category, that contains a dict of
|
"""Return a dict, indexed by category, that contains a dict of
|
||||||
latency numbers for each category. Each dict will contain the
|
latency numbers for each category. If there are sufficient samples
|
||||||
|
for unambiguous interpretation, each dict will contain the
|
||||||
following keys: mean, 01_0_percentile, 10_0_percentile,
|
following keys: mean, 01_0_percentile, 10_0_percentile,
|
||||||
50_0_percentile (median), 90_0_percentile, 95_0_percentile,
|
50_0_percentile (median), 90_0_percentile, 95_0_percentile,
|
||||||
99_0_percentile, 99_9_percentile. If no samples have been collected
|
99_0_percentile, 99_9_percentile. If there are insufficient
|
||||||
for the given category, then that category name will not be present
|
samples for a given percentile to be interpreted unambiguously
|
||||||
in the return value."""
|
that percentile will be reported as None. If no samples have been
|
||||||
|
collected for the given category, then that category name will
|
||||||
|
not be present in the return value. """
|
||||||
# note that Amazon's Dynamo paper says they use 99.9% percentile.
|
# note that Amazon's Dynamo paper says they use 99.9% percentile.
|
||||||
output = {}
|
output = {}
|
||||||
for category in self.latencies:
|
for category in self.latencies:
|
||||||
@ -129,16 +132,25 @@ class StorageServer(service.MultiService, Referenceable):
|
|||||||
continue
|
continue
|
||||||
stats = {}
|
stats = {}
|
||||||
samples = self.latencies[category][:]
|
samples = self.latencies[category][:]
|
||||||
samples.sort()
|
|
||||||
count = len(samples)
|
count = len(samples)
|
||||||
stats["mean"] = sum(samples) / count
|
stats["samplesize"] = count
|
||||||
stats["01_0_percentile"] = samples[int(0.01 * count)]
|
samples.sort()
|
||||||
stats["10_0_percentile"] = samples[int(0.1 * count)]
|
if count > 1:
|
||||||
stats["50_0_percentile"] = samples[int(0.5 * count)]
|
stats["mean"] = sum(samples) / count
|
||||||
stats["90_0_percentile"] = samples[int(0.9 * count)]
|
else:
|
||||||
stats["95_0_percentile"] = samples[int(0.95 * count)]
|
stats["mean"] = None
|
||||||
stats["99_0_percentile"] = samples[int(0.99 * count)]
|
|
||||||
stats["99_9_percentile"] = samples[int(0.999 * count)]
|
orderstatlist = [(0.01, "01_0_percentile", 100), (0.1, "10_0_percentile", 10),\
|
||||||
|
(0.50, "50_0_percentile", 10), (0.90, "90_0_percentile", 10),\
|
||||||
|
(0.95, "95_0_percentile", 20), (0.99, "99_0_percentile", 100),\
|
||||||
|
(0.999, "99_9_percentile", 1000)]
|
||||||
|
|
||||||
|
for percentile, percentilestring, minnumtoobserve in orderstatlist:
|
||||||
|
if count >= minnumtoobserve:
|
||||||
|
stats[percentilestring] = samples[int(percentile*count)]
|
||||||
|
else:
|
||||||
|
stats[percentilestring] = None
|
||||||
|
|
||||||
output[category] = stats
|
output[category] = stats
|
||||||
return output
|
return output
|
||||||
|
|
||||||
@ -551,4 +563,3 @@ class StorageServer(service.MultiService, Referenceable):
|
|||||||
share_type=share_type, si=si_s, shnum=shnum, reason=reason,
|
share_type=share_type, si=si_s, shnum=shnum, reason=reason,
|
||||||
level=log.SCARY, umid="SGx2fA")
|
level=log.SCARY, umid="SGx2fA")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -1311,6 +1311,8 @@ class Stats(unittest.TestCase):
|
|||||||
ss.add_latency("allocate", 1.0 * i)
|
ss.add_latency("allocate", 1.0 * i)
|
||||||
for i in range(1000):
|
for i in range(1000):
|
||||||
ss.add_latency("renew", 1.0 * i)
|
ss.add_latency("renew", 1.0 * i)
|
||||||
|
for i in range(20):
|
||||||
|
ss.add_latency("write", 1.0 * i)
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
ss.add_latency("cancel", 2.0 * i)
|
ss.add_latency("cancel", 2.0 * i)
|
||||||
ss.add_latency("get", 5.0)
|
ss.add_latency("get", 5.0)
|
||||||
@ -1318,7 +1320,7 @@ class Stats(unittest.TestCase):
|
|||||||
output = ss.get_latencies()
|
output = ss.get_latencies()
|
||||||
|
|
||||||
self.failUnlessEqual(sorted(output.keys()),
|
self.failUnlessEqual(sorted(output.keys()),
|
||||||
sorted(["allocate", "renew", "cancel", "get"]))
|
sorted(["allocate", "renew", "cancel", "write", "get"]))
|
||||||
self.failUnlessEqual(len(ss.latencies["allocate"]), 1000)
|
self.failUnlessEqual(len(ss.latencies["allocate"]), 1000)
|
||||||
self.failUnless(abs(output["allocate"]["mean"] - 9500) < 1, output)
|
self.failUnless(abs(output["allocate"]["mean"] - 9500) < 1, output)
|
||||||
self.failUnless(abs(output["allocate"]["01_0_percentile"] - 9010) < 1, output)
|
self.failUnless(abs(output["allocate"]["01_0_percentile"] - 9010) < 1, output)
|
||||||
@ -1339,25 +1341,35 @@ class Stats(unittest.TestCase):
|
|||||||
self.failUnless(abs(output["renew"]["99_0_percentile"] - 990) < 1, output)
|
self.failUnless(abs(output["renew"]["99_0_percentile"] - 990) < 1, output)
|
||||||
self.failUnless(abs(output["renew"]["99_9_percentile"] - 999) < 1, output)
|
self.failUnless(abs(output["renew"]["99_9_percentile"] - 999) < 1, output)
|
||||||
|
|
||||||
|
self.failUnlessEqual(len(ss.latencies["write"]), 20)
|
||||||
|
self.failUnless(abs(output["write"]["mean"] - 9) < 1, output)
|
||||||
|
self.failUnless(output["write"]["01_0_percentile"] is None, output)
|
||||||
|
self.failUnless(abs(output["write"]["10_0_percentile"] - 2) < 1, output)
|
||||||
|
self.failUnless(abs(output["write"]["50_0_percentile"] - 10) < 1, output)
|
||||||
|
self.failUnless(abs(output["write"]["90_0_percentile"] - 18) < 1, output)
|
||||||
|
self.failUnless(abs(output["write"]["95_0_percentile"] - 19) < 1, output)
|
||||||
|
self.failUnless(output["write"]["99_0_percentile"] is None, output)
|
||||||
|
self.failUnless(output["write"]["99_9_percentile"] is None, output)
|
||||||
|
|
||||||
self.failUnlessEqual(len(ss.latencies["cancel"]), 10)
|
self.failUnlessEqual(len(ss.latencies["cancel"]), 10)
|
||||||
self.failUnless(abs(output["cancel"]["mean"] - 9) < 1, output)
|
self.failUnless(abs(output["cancel"]["mean"] - 9) < 1, output)
|
||||||
self.failUnless(abs(output["cancel"]["01_0_percentile"] - 0) < 1, output)
|
self.failUnless(output["cancel"]["01_0_percentile"] is None, output)
|
||||||
self.failUnless(abs(output["cancel"]["10_0_percentile"] - 2) < 1, output)
|
self.failUnless(abs(output["cancel"]["10_0_percentile"] - 2) < 1, output)
|
||||||
self.failUnless(abs(output["cancel"]["50_0_percentile"] - 10) < 1, output)
|
self.failUnless(abs(output["cancel"]["50_0_percentile"] - 10) < 1, output)
|
||||||
self.failUnless(abs(output["cancel"]["90_0_percentile"] - 18) < 1, output)
|
self.failUnless(abs(output["cancel"]["90_0_percentile"] - 18) < 1, output)
|
||||||
self.failUnless(abs(output["cancel"]["95_0_percentile"] - 18) < 1, output)
|
self.failUnless(output["cancel"]["95_0_percentile"] is None, output)
|
||||||
self.failUnless(abs(output["cancel"]["99_0_percentile"] - 18) < 1, output)
|
self.failUnless(output["cancel"]["99_0_percentile"] is None, output)
|
||||||
self.failUnless(abs(output["cancel"]["99_9_percentile"] - 18) < 1, output)
|
self.failUnless(output["cancel"]["99_9_percentile"] is None, output)
|
||||||
|
|
||||||
self.failUnlessEqual(len(ss.latencies["get"]), 1)
|
self.failUnlessEqual(len(ss.latencies["get"]), 1)
|
||||||
self.failUnless(abs(output["get"]["mean"] - 5) < 1, output)
|
self.failUnless(output["get"]["mean"] is None, output)
|
||||||
self.failUnless(abs(output["get"]["01_0_percentile"] - 5) < 1, output)
|
self.failUnless(output["get"]["01_0_percentile"] is None, output)
|
||||||
self.failUnless(abs(output["get"]["10_0_percentile"] - 5) < 1, output)
|
self.failUnless(output["get"]["10_0_percentile"] is None, output)
|
||||||
self.failUnless(abs(output["get"]["50_0_percentile"] - 5) < 1, output)
|
self.failUnless(output["get"]["50_0_percentile"] is None, output)
|
||||||
self.failUnless(abs(output["get"]["90_0_percentile"] - 5) < 1, output)
|
self.failUnless(output["get"]["90_0_percentile"] is None, output)
|
||||||
self.failUnless(abs(output["get"]["95_0_percentile"] - 5) < 1, output)
|
self.failUnless(output["get"]["95_0_percentile"] is None, output)
|
||||||
self.failUnless(abs(output["get"]["99_0_percentile"] - 5) < 1, output)
|
self.failUnless(output["get"]["99_0_percentile"] is None, output)
|
||||||
self.failUnless(abs(output["get"]["99_9_percentile"] - 5) < 1, output)
|
self.failUnless(output["get"]["99_9_percentile"] is None, output)
|
||||||
|
|
||||||
def remove_tags(s):
|
def remove_tags(s):
|
||||||
s = re.sub(r'<[^>]*>', ' ', s)
|
s = re.sub(r'<[^>]*>', ' ', s)
|
||||||
|
Reference in New Issue
Block a user