deep-stats: add file-size histogram

This commit is contained in:
Brian Warner
2008-05-08 16:19:42 -07:00
parent a1ea3d9b37
commit fabdc28c06
5 changed files with 119 additions and 3 deletions

View File

@ -585,6 +585,9 @@ GET $URL?t=deep-stats
size-mutable-files (TODO): same, for current version of all mutable files size-mutable-files (TODO): same, for current version of all mutable files
size-literal-files: same, for LIT files size-literal-files: same, for LIT files
size-directories: size of directories (includes size-literal-files) size-directories: size of directories (includes size-literal-files)
size-files-histogram: list of (minsize, maxsize, count) buckets forming a
histogram of file sizes (bounds are inclusive), at 5dB/bucket, i.e. two buckets per decade,
covering both literal and immutable files
largest-directory: number of children in the largest directory largest-directory: number of children in the largest directory
largest-immutable-file: number of bytes in the largest CHK file largest-immutable-file: number of bytes in the largest CHK file

View File

@ -1,5 +1,5 @@
import os, time import os, time, math
from zope.interface import implements from zope.interface import implements
from twisted.internet import defer from twisted.internet import defer
@ -8,7 +8,7 @@ from allmydata.mutable.common import NotMutableError
from allmydata.mutable.node import MutableFileNode from allmydata.mutable.node import MutableFileNode
from allmydata.interfaces import IMutableFileNode, IDirectoryNode,\ from allmydata.interfaces import IMutableFileNode, IDirectoryNode,\
IURI, IFileNode, IMutableFileURI, IVerifierURI, IFilesystemNode IURI, IFileNode, IMutableFileURI, IVerifierURI, IFilesystemNode
from allmydata.util import hashutil from allmydata.util import hashutil, mathutil
from allmydata.util.hashutil import netstring from allmydata.util.hashutil import netstring
from allmydata.util.limiter import ConcurrencyLimiter from allmydata.util.limiter import ConcurrencyLimiter
from allmydata.uri import NewDirectoryURI from allmydata.uri import NewDirectoryURI
@ -514,6 +514,7 @@ class NewDirectoryNode:
elif IFileNode.providedBy(child): # CHK and LIT elif IFileNode.providedBy(child): # CHK and LIT
stats.add("count-files") stats.add("count-files")
size = child.get_size() size = child.get_size()
stats.histogram("size-files-histogram", size)
if child.get_uri().startswith("URI:LIT:"): if child.get_uri().startswith("URI:LIT:"):
stats.add("count-literal-files") stats.add("count-literal-files")
stats.add("size-literal-files", size) stats.add("size-literal-files", size)
@ -544,6 +545,11 @@ class DeepStats:
#"largest-mutable-file", #"largest-mutable-file",
]: ]:
self.stats[k] = 0 self.stats[k] = 0
self.histograms = {}
for k in ["size-files-histogram"]:
self.histograms[k] = {} # maps (min,max) to count
self.buckets = [ (0,0), (1,3)]
self.root = math.sqrt(10)
def add(self, key, value=1): def add(self, key, value=1):
self.stats[key] += value self.stats[key] += value
@ -551,8 +557,38 @@ class DeepStats:
def max(self, key, value): def max(self, key, value):
self.stats[key] = max(self.stats[key], value) self.stats[key] = max(self.stats[key], value)
def which_bucket(self, size):
    """Return the (min, max) bucket such that min <= size <= max.

    Buckets are drawn from the series (0,0), (1,3), (4,10), (11,31),
    (32,100), (101,316), (317,1000), etc: two per decade. Known buckets
    are cached in self.buckets, which is extended on demand.
    """
    assert size >= 0
    # first look through the buckets we have already computed
    for candidate in self.buckets:
        if candidate[0] <= size <= candidate[1]:
            return candidate
    # not covered yet: keep appending the next bucket until one fits
    while True:
        lower = self.buckets[-1][1] + 1
        # upper bound is next_power_of_k(lower, self.root), truncated to int
        upper = int(mathutil.next_power_of_k(lower, self.root))
        candidate = (lower, upper)
        self.buckets.append(candidate)
        if lower <= size <= upper:
            return candidate
def histogram(self, key, size):
    """Record one file of the given size in the histogram named by key.

    The size is mapped to its (min, max) bucket via which_bucket(), and
    that bucket's counter in self.histograms[key] is incremented,
    starting from zero the first time the bucket is seen.
    """
    size_range = self.which_bucket(size)
    counts = self.histograms[key]
    counts[size_range] = counts.get(size_range, 0) + 1
def get_results(self): def get_results(self):
return self.stats stats = self.stats.copy()
for key in self.histograms:
h = self.histograms[key]
out = [ (bucket[0], bucket[1], h[bucket]) for bucket in h ]
out.sort()
stats[key] = out
return stats
# use client.create_dirnode() to make one of these # use client.create_dirnode() to make one of these

View File

@ -166,6 +166,7 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
d = self.client.create_empty_dirnode() d = self.client.create_empty_dirnode()
def _then(n): def _then(n):
# /
self.failUnless(n.is_mutable()) self.failUnless(n.is_mutable())
u = n.get_uri() u = n.get_uri()
self.failUnless(u) self.failUnless(u)
@ -186,8 +187,13 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
assert isinstance(ffu_v, str) assert isinstance(ffu_v, str)
self.expected_manifest.append(ffu_v) self.expected_manifest.append(ffu_v)
d.addCallback(lambda res: n.set_uri(u"child", fake_file_uri)) d.addCallback(lambda res: n.set_uri(u"child", fake_file_uri))
# /
# /child = mutable
d.addCallback(lambda res: n.create_empty_directory(u"subdir")) d.addCallback(lambda res: n.create_empty_directory(u"subdir"))
# /
# /child = mutable
# /subdir = directory
def _created(subdir): def _created(subdir):
self.failUnless(isinstance(subdir, FakeDirectoryNode)) self.failUnless(isinstance(subdir, FakeDirectoryNode))
self.subdir = subdir self.subdir = subdir
@ -230,6 +236,7 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
stats["size-directories"]) stats["size-directories"])
self.failUnless(stats["largest-directory"] > 500, self.failUnless(stats["largest-directory"] > 500,
stats["largest-directory"]) stats["largest-directory"])
self.failUnlessEqual(stats["size-files-histogram"], [])
d.addCallback(_check_deepstats) d.addCallback(_check_deepstats)
def _add_subsubdir(res): def _add_subsubdir(res):
@ -458,6 +465,49 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
return d return d
class DeepStats(unittest.TestCase):
    """Unit tests for the dirnode.DeepStats accumulator."""

    def test_stats(self):
        """Check add(), max() and histogram() as reported by get_results()."""
        hist_key = "size-files-histogram"

        # record a first file of 123 bytes
        stats = dirnode.DeepStats()
        stats.add("count-files")
        stats.add("size-immutable-files", 123)
        stats.histogram(hist_key, 123)
        stats.max("largest-directory", 444)

        results = stats.get_results()
        self.failUnlessEqual(results["count-files"], 1)
        self.failUnlessEqual(results["size-immutable-files"], 123)
        self.failUnlessEqual(results["largest-directory"], 444)
        self.failUnlessEqual(results["count-literal-files"], 0)

        # record a second file of 321 bytes; the smaller directory size
        # must not replace the previous maximum
        stats.add("count-files")
        stats.add("size-immutable-files", 321)
        stats.histogram(hist_key, 321)
        stats.max("largest-directory", 2)

        results = stats.get_results()
        self.failUnlessEqual(results["count-files"], 2)
        self.failUnlessEqual(results["size-immutable-files"], 444)
        self.failUnlessEqual(results["largest-directory"], 444)
        self.failUnlessEqual(results["count-literal-files"], 0)
        two_file_histogram = [(101, 316, 1), (317, 1000, 1)]
        self.failUnlessEqual(results[hist_key], two_file_histogram)

        # fresh accumulator: one file of every size from 1 to 1099 bytes,
        # plus a single very large one
        stats = dirnode.DeepStats()
        for size in range(1, 1100):
            stats.histogram(hist_key, size)
        four_terabytes = 4*1000*1000*1000*1000 # 4TB
        stats.histogram(hist_key, four_terabytes)

        results = stats.get_results()
        expected_histogram = [
            (1, 3, 3),
            (4, 10, 7),
            (11, 31, 21),
            (32, 100, 69),
            (101, 316, 216),
            (317, 1000, 684),
            (1001, 3162, 99),
            (3162277660169, 10000000000000, 1),
        ]
        self.failUnlessEqual(results[hist_key], expected_histogram)
netstring = hashutil.netstring netstring = hashutil.netstring
split_netstring = dirnode.split_netstring split_netstring = dirnode.split_netstring

View File

@ -1112,6 +1112,31 @@ class SystemTest(testutil.SignalMixin, testutil.PollMixin, testutil.StallMixin,
# P/s2-rw/mydata992 (same as P/s2-rw/mydata992) # P/s2-rw/mydata992 (same as P/s2-rw/mydata992)
d1.addCallback(lambda manifest: d1.addCallback(lambda manifest:
self.failUnlessEqual(len(manifest), 4)) self.failUnlessEqual(len(manifest), 4))
d1.addCallback(lambda res: home.deep_stats())
def _check_stats(stats):
expected = {"count-immutable-files": 1,
"count-mutable-files": 0,
"count-literal-files": 1,
"count-files": 2,
"count-directories": 3,
"size-immutable-files": 112,
"size-literal-files": 23,
#"size-directories": 616, # varies
#"largest-directory": 616,
"largest-directory-children": 3,
"largest-immutable-file": 112,
}
for k,v in expected.iteritems():
self.failUnlessEqual(stats[k], v,
"stats[%s] was %s, not %s" %
(k, stats[k], v))
self.failUnless(stats["size-directories"] > 1300,
stats["size-directories"])
self.failUnless(stats["largest-directory"] > 800,
stats["largest-directory"])
self.failUnlessEqual(stats["size-files-histogram"],
[ (11, 31, 1), (101, 316, 1) ])
d1.addCallback(_check_stats)
return d1 return d1
d.addCallback(_got_home) d.addCallback(_got_home)
return d return d

View File

@ -788,6 +788,8 @@ class Web(WebMixin, unittest.TestCase):
self.failUnlessEqual(stats[k], v, self.failUnlessEqual(stats[k], v,
"stats[%s] was %s, not %s" % "stats[%s] was %s, not %s" %
(k, stats[k], v)) (k, stats[k], v))
self.failUnlessEqual(stats["size-files-histogram"],
[ [11, 31, 3] ])
d.addCallback(_got) d.addCallback(_got)
return d return d