deep-stats: add file-size histogram

This commit is contained in:
Brian Warner 2008-05-08 16:19:42 -07:00
parent a1ea3d9b37
commit fabdc28c06
5 changed files with 119 additions and 3 deletions

View File

@ -585,6 +585,9 @@ GET $URL?t=deep-stats
size-mutable-files (TODO): same, for current version of all mutable files
size-literal-files: same, for LIT files
size-directories: size of directories (includes size-literal-files)
size-files-histogram: list of (minsize, maxsize, count) buckets forming a
histogram of file sizes, covering both literal and immutable files. Bounds
are inclusive; each bucket spans a factor of sqrt(10) (5 dB), i.e. two per decade
largest-directory: number of children in the largest directory
largest-immutable-file: number of bytes in the largest CHK file

View File

@ -1,5 +1,5 @@
import os, time
import os, time, math
from zope.interface import implements
from twisted.internet import defer
@ -8,7 +8,7 @@ from allmydata.mutable.common import NotMutableError
from allmydata.mutable.node import MutableFileNode
from allmydata.interfaces import IMutableFileNode, IDirectoryNode,\
IURI, IFileNode, IMutableFileURI, IVerifierURI, IFilesystemNode
from allmydata.util import hashutil
from allmydata.util import hashutil, mathutil
from allmydata.util.hashutil import netstring
from allmydata.util.limiter import ConcurrencyLimiter
from allmydata.uri import NewDirectoryURI
@ -514,6 +514,7 @@ class NewDirectoryNode:
elif IFileNode.providedBy(child): # CHK and LIT
stats.add("count-files")
size = child.get_size()
stats.histogram("size-files-histogram", size)
if child.get_uri().startswith("URI:LIT:"):
stats.add("count-literal-files")
stats.add("size-literal-files", size)
@ -544,6 +545,11 @@ class DeepStats:
#"largest-mutable-file",
]:
self.stats[k] = 0
self.histograms = {}
for k in ["size-files-histogram"]:
self.histograms[k] = {} # maps (min,max) to count
self.buckets = [ (0,0), (1,3)]
self.root = math.sqrt(10)
def add(self, key, value=1):
    """Increase the counter stored under `key` by `value` (default 1).

    The counter is read from and written back to self.stats; `key` is
    looked up directly, so it must already be present there.
    """
    counters = self.stats
    counters[key] += value
@ -551,8 +557,38 @@ class DeepStats:
def max(self, key, value):
    """Keep the larger of `value` and the number already stored under `key`.

    The stored value is replaced only when `value` is strictly greater;
    otherwise the existing value is written back unchanged.
    """
    current = self.stats[key]
    self.stats[key] = value if value > current else current
def which_bucket(self, size):
    """Return the (min, max) histogram bucket that contains `size`.

    Bounds are inclusive: min <= size <= max. Buckets come from the set
    (0,0), (1,3), (4,10), (11,31), (32,100), (101,316), (317,1000), etc:
    two per decade. self.buckets caches the buckets generated so far and
    is extended on demand, one bucket at a time.
    """
    assert size >= 0
    known = self.buckets
    position = 0
    while True:
        if position >= len(known):
            # Ran off the end of the cached list: the next bucket starts
            # just above the previous one's upper bound.
            previous_max = known[position - 1][1]
            lower = previous_max + 1
            # NOTE(review): presumably next_power_of_k rounds `lower` up to
            # the next power of self.root -- confirm against mathutil.
            upper = int(mathutil.next_power_of_k(lower, self.root))
            known.append((lower, upper))
        candidate = known[position]
        lower_bound, upper_bound = candidate
        if lower_bound <= size <= upper_bound:
            return candidate
        position += 1
def histogram(self, key, size):
    """Count one occurrence of `size` in the histogram named `key`.

    The bucket is chosen by self.which_bucket(size); its counter in
    self.histograms[key] is created at zero on first use and then
    incremented.
    """
    span = self.which_bucket(size)
    counts = self.histograms[key]
    counts[span] = counts.get(span, 0) + 1
def get_results(self):
    """Return a dict of all collected statistics, histograms included.

    The result is a copy of self.stats (so the internal counters are not
    exposed or modified), extended with one entry per histogram: a list of
    (min, max, count) tuples sorted in ascending order. A histogram that
    never received a sample is reported as an empty list.
    """
    # The earlier unconditional "return self.stats" is dropped: it returned
    # before the histograms were merged, so callers never saw them.
    results = self.stats.copy()
    for key, counts in self.histograms.items():
        results[key] = sorted((low, high, count)
                              for (low, high), count in counts.items())
    return results
# use client.create_dirnode() to make one of these

View File

@ -166,6 +166,7 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
d = self.client.create_empty_dirnode()
def _then(n):
# /
self.failUnless(n.is_mutable())
u = n.get_uri()
self.failUnless(u)
@ -186,8 +187,13 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
assert isinstance(ffu_v, str)
self.expected_manifest.append(ffu_v)
d.addCallback(lambda res: n.set_uri(u"child", fake_file_uri))
# /
# /child = mutable
d.addCallback(lambda res: n.create_empty_directory(u"subdir"))
# /
# /child = mutable
# /subdir = directory
def _created(subdir):
self.failUnless(isinstance(subdir, FakeDirectoryNode))
self.subdir = subdir
@ -230,6 +236,7 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
stats["size-directories"])
self.failUnless(stats["largest-directory"] > 500,
stats["largest-directory"])
self.failUnlessEqual(stats["size-files-histogram"], [])
d.addCallback(_check_deepstats)
def _add_subsubdir(res):
@ -458,6 +465,49 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
return d
class DeepStats(unittest.TestCase):
    """Exercise dirnode.DeepStats: counters, maxima and the size histogram."""

    def test_stats(self):
        hist_key = "size-files-histogram"

        # first file, size 123
        ds = dirnode.DeepStats()
        ds.add("count-files")
        ds.add("size-immutable-files", 123)
        ds.histogram(hist_key, 123)
        ds.max("largest-directory", 444)
        results = ds.get_results()
        self.failUnlessEqual(results["count-files"], 1)
        self.failUnlessEqual(results["size-immutable-files"], 123)
        self.failUnlessEqual(results["largest-directory"], 444)
        self.failUnlessEqual(results["count-literal-files"], 0)

        # second file, size 321; a smaller "largest-directory" must not
        # displace the recorded maximum
        ds.add("count-files")
        ds.add("size-immutable-files", 321)
        ds.histogram(hist_key, 321)
        ds.max("largest-directory", 2)
        results = ds.get_results()
        self.failUnlessEqual(results["count-files"], 2)
        self.failUnlessEqual(results["size-immutable-files"], 444)
        self.failUnlessEqual(results["largest-directory"], 444)
        self.failUnlessEqual(results["count-literal-files"], 0)
        two_files = [(101, 316, 1), (317, 1000, 1)]
        self.failUnlessEqual(results[hist_key], two_files)

        # fresh instance: one sample for every size 1..1099, plus one 4TB file
        ds = dirnode.DeepStats()
        for size in range(1, 1100):
            ds.histogram(hist_key, size)
        ds.histogram(hist_key, 4*1000*1000*1000*1000) # 4TB
        results = ds.get_results()
        sweep = [
            (1, 3, 3),
            (4, 10, 7),
            (11, 31, 21),
            (32, 100, 69),
            (101, 316, 216),
            (317, 1000, 684),
            (1001, 3162, 99),
            (3162277660169, 10000000000000, 1),
        ]
        self.failUnlessEqual(results[hist_key], sweep)
netstring = hashutil.netstring
split_netstring = dirnode.split_netstring

View File

@ -1112,6 +1112,31 @@ class SystemTest(testutil.SignalMixin, testutil.PollMixin, testutil.StallMixin,
# P/s2-rw/mydata992 (same as P/s2-rw/mydata992)
d1.addCallback(lambda manifest:
self.failUnlessEqual(len(manifest), 4))
d1.addCallback(lambda res: home.deep_stats())
def _check_stats(stats):
    # Verify the deep-stats results: exact values where they are fixed,
    # lower bounds for the two directory-size figures.
    exact = {
        "count-immutable-files": 1,
        "count-mutable-files": 0,
        "count-literal-files": 1,
        "count-files": 2,
        "count-directories": 3,
        "size-immutable-files": 112,
        "size-literal-files": 23,
        #"size-directories": 616, # varies
        #"largest-directory": 616,
        "largest-directory-children": 3,
        "largest-immutable-file": 112,
    }
    for name in exact:
        wanted = exact[name]
        self.failUnlessEqual(stats[name], wanted,
                             "stats[%s] was %s, not %s" %
                             (name, stats[name], wanted))
    # these two are only checked against a floor; the failing value is
    # passed along as the message
    for name, floor in (("size-directories", 1300),
                        ("largest-directory", 800)):
        self.failUnless(stats[name] > floor, stats[name])
    self.failUnlessEqual(stats["size-files-histogram"],
                         [ (11, 31, 1), (101, 316, 1) ])
d1.addCallback(_check_stats)
return d1
d.addCallback(_got_home)
return d

View File

@ -788,6 +788,8 @@ class Web(WebMixin, unittest.TestCase):
self.failUnlessEqual(stats[k], v,
"stats[%s] was %s, not %s" %
(k, stats[k], v))
self.failUnlessEqual(stats["size-files-histogram"],
[ [11, 31, 3] ])
d.addCallback(_got)
return d