crawler: add get_progress, clean up get_state

This commit is contained in:
Brian Warner 2009-02-20 18:27:43 -07:00
parent 2e45619844
commit 73e05bf967
2 changed files with 76 additions and 8 deletions

View File

@ -75,12 +75,43 @@ class ShareCrawler(service.MultiService):
self.current_sleep_time = None
self.next_wake_time = None
def get_progress(self):
    """Return a dict describing how much progress the crawler is making.

    The primary key is 'cycle-in-progress': True if the crawler is
    currently traversing the shares, False if it is idle between cycles.

    If cycle-in-progress is True, the following keys will be present::

     cycle-complete-percentage: float, from 0.0 to 100.0, indicating how
                                far the crawler has progressed through
                                the current cycle
     remaining-sleep-time: float, seconds from now when we do more work

    If cycle-in-progress is False, the following keys are available::

     next-crawl-time: float, seconds-since-epoch when next crawl starts
     remaining-wait-time: float, seconds from now when next crawl starts
    """
    d = {}
    if self.state["current-cycle"] is None:
        # idle between cycles: the only way current-cycle can be None is
        # if we are sitting in the inter-cycle timer
        assert self.sleeping_between_cycles
        d["cycle-in-progress"] = False
        d["next-crawl-time"] = self.next_wake_time
        d["remaining-wait-time"] = self.next_wake_time - time.time()
    else:
        d["cycle-in-progress"] = True
        # progress is counted in whole prefixdirs completed so far, so the
        # percentage advances in len(self.prefixes) discrete steps
        pct = 100.0 * self.last_complete_prefix_index / len(self.prefixes)
        d["cycle-complete-percentage"] = pct
        d["remaining-sleep-time"] = self.next_wake_time - time.time()
    return d
def get_state(self):
"""I return the current state of the crawler. This is a copy of my
state dictionary, plus the following keys::
current-sleep-time: float, duration of our current sleep
next-wake-time: float, seconds-since-epoch of when we will next wake
state dictionary.
If we are not currently sleeping (i.e. get_state() was called from
inside the process_prefixdir, process_bucket, or finished_cycle()
@ -88,8 +119,6 @@ class ShareCrawler(service.MultiService):
these two keys will be None.
"""
state = self.state.copy() # it isn't a deepcopy, so don't go crazy
state["current-sleep-time"] = self.current_sleep_time
state["next-wake-time"] = self.next_wake_time
return state
def load_state(self):

View File

@ -31,6 +31,7 @@ class PacedCrawler(ShareCrawler):
self.countdown = 6
self.all_buckets = []
self.finished_d = defer.Deferred()
self.yield_cb = None
def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32):
self.all_buckets.append(storage_index_b32)
self.countdown -= 1
@ -39,6 +40,8 @@ class PacedCrawler(ShareCrawler):
self.cpu_slice = -1.0
def yielding(self, sleep_time):
    """Hook invoked when the crawler yields the CPU.

    Widens the cpu_slice so the next pass runs unthrottled, then fires
    the optional test callback (yield_cb) if one is registered.
    """
    self.cpu_slice = 500
    cb = self.yield_cb
    if cb:
        cb()
def finished_cycle(self, cycle):
    # Fire finished_d on a later reactor turn (eventual-send), so test
    # code waiting on the Deferred runs outside the crawler's call stack.
    # NOTE(review): finished_d is created in __init__ and appears to be
    # fired once per cycle -- confirm it is replaced between cycles, since
    # a Deferred can only be called back once.
    eventual.eventually(self.finished_d.callback, None)
@ -173,6 +176,7 @@ class Basic(unittest.TestCase, StallMixin, pollmixin.PollMixin):
# that should stop in the middle of one of the buckets.
c.cpu_slice = PacedCrawler.cpu_slice
self.failUnlessEqual(len(c.all_buckets), 6)
c.start_current_prefix(time.time()) # finish it
self.failUnlessEqual(len(sis), len(c.all_buckets))
self.failUnlessEqual(sorted(sis), sorted(c.all_buckets))
@ -252,18 +256,53 @@ class Basic(unittest.TestCase, StallMixin, pollmixin.PollMixin):
statefile = os.path.join(self.basedir, "statefile")
c = PacedCrawler(ss, statefile)
did_check_progress = [False]
def check_progress():
c.yield_cb = None
try:
p = c.get_progress()
self.failUnlessEqual(p["cycle-in-progress"], True)
pct = p["cycle-complete-percentage"]
# after 6 buckets, we happen to be at 76.17% complete. As
# long as we create shares in deterministic order, this will
# continue to be true.
self.failUnlessEqual(int(pct), 76)
left = p["remaining-sleep-time"]
self.failUnless(isinstance(left, float), left)
self.failUnless(left > 0.0, left)
except Exception, e:
did_check_progress[0] = e
else:
did_check_progress[0] = True
c.yield_cb = check_progress
c.setServiceParent(self.s)
# that should get through 6 buckets, pause for a little while, then
# resume
# that should get through 6 buckets, pause for a little while (and
# run check_progress()), then resume
d = c.finished_d
def _check(ignored):
if did_check_progress[0] is not True:
raise did_check_progress[0]
self.failUnless(did_check_progress[0])
self.failUnlessEqual(sorted(sis), sorted(c.all_buckets))
# at this point, the crawler should be sitting in the inter-cycle
# timer, which should be pegged at the minimum cycle time
self.failUnless(c.timer)
self.failUnless(c.sleeping_between_cycles)
self.failUnlessEqual(c.current_sleep_time, c.minimum_cycle_time)
p = c.get_progress()
self.failUnlessEqual(p["cycle-in-progress"], False)
naptime = p["remaining-wait-time"]
self.failUnless(isinstance(naptime, float), naptime)
# min-cycle-time is 300, so this is basically testing that it took
# less than 290s to crawl
self.failUnless(naptime > 10.0, naptime)
soon = p["next-crawl-time"] - time.time()
self.failUnless(soon > 10.0, soon)
d.addCallback(_check)
return d