download.py: use producer/consumer to reduce memory usage, closes #129.

If the DownloadTarget is also an IConsumer, give it control of the brakes
by offering ourselves to it via target.registerProducer(). When it tells us
to pause, we set a flag that is checked between segment downloads and decodes.

webish.py: make WebDownloadTarget an IConsumer and pass control along to
the http.Request, which already knows how to be an IConsumer.

This reduces the memory footprint of stalled HTTP GETs to a bare minimum,
and thus closes #129.
Brian Warner 2007-09-19 00:34:47 -07:00
parent 9c9a793540
commit 1340c484c6
2 changed files with 68 additions and 6 deletions
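
For readers less familiar with Twisted's flow-control interfaces, the handshake
this commit leans on works roughly as sketched below. The sketch is illustrative
only: the ToyConsumer/ToyDownloader names and the 64 KiB threshold are invented,
and real code would declare zope.interface's IConsumer/IPushProducer rather than
use plain classes.

    # Illustrative sketch of the push-producer handshake (not Tahoe code).

    class ToyConsumer:
        """Stands in for WebDownloadTarget / twisted.web's http.Request."""
        def __init__(self):
            self.producer = None
            self.buffered = 0

        def registerProducer(self, producer, streaming):
            # streaming=True marks a push producer: it keeps producing
            # until we call pauseProducing() on it.
            self.producer = producer

        def unregisterProducer(self):
            self.producer = None

        def write(self, data):
            self.buffered += len(data)
            if self.buffered > 64 * 1024 and self.producer:
                # our outgoing pipe is "full": apply the brakes
                self.producer.pauseProducing()

        def drained(self):
            # called when the slow HTTP client finally reads some data
            self.buffered = 0
            if self.producer:
                self.producer.resumeProducing()

    class ToyDownloader:
        """Stands in for FileDownloader: it checks the pause/stop state
        between segment downloads and decodes, not after every byte."""
        def __init__(self, consumer):
            self.paused = False
            self.stopped = False
            consumer.registerProducer(self, streaming=True)

        def pauseProducing(self):
            self.paused = True

        def resumeProducing(self):
            self.paused = False

        def stopProducing(self):
            self.stopped = True

In the actual commit the "flag" is a Deferred for pause and a boolean for stop,
as the download.py diff below shows.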

download.py

@@ -3,7 +3,9 @@ import os, random
 from zope.interface import implements
 from twisted.python import log
 from twisted.internet import defer
+from twisted.internet.interfaces import IPushProducer, IConsumer
 from twisted.application import service
+from foolscap.eventual import eventually
 from allmydata.util import idlib, mathutil, hashutil
 from allmydata.util.assertutil import _assert
@@ -23,6 +25,9 @@ class BadPlaintextHashValue(Exception):
 class BadCrypttextHashValue(Exception):
     pass
 
+class DownloadStopped(Exception):
+    pass
+
 class Output:
     def __init__(self, downloadable, key, total_length):
         self.downloadable = downloadable
@@ -282,6 +287,7 @@ class SegmentDownloader:
         self.parent.bucket_failed(vbucket)
 
 class FileDownloader:
+    implements(IPushProducer)
     check_crypttext_hash = True
     check_plaintext_hash = True
@@ -295,7 +301,12 @@ class FileDownloader:
         self._size = u.size
         self._num_needed_shares = u.needed_shares
 
+        if IConsumer.providedBy(downloadable):
+            downloadable.registerProducer(self, True)
+        self._downloadable = downloadable
         self._output = Output(downloadable, u.key, self._size)
+        self._paused = False
+        self._stopped = False
 
         self.active_buckets = {} # k: shnum, v: bucket
         self._share_buckets = [] # list of (sharenum, bucket) tuples
@@ -311,8 +322,23 @@ class FileDownloader:
                               "crypttext_hashtree": 0,
                               }
 
+    def pauseProducing(self):
+        if self._paused:
+            return
+        self._paused = defer.Deferred()
+
+    def resumeProducing(self):
+        if self._paused:
+            p = self._paused
+            self._paused = None
+            eventually(p.callback, None)
+
+    def stopProducing(self):
+        log.msg("Download.stopProducing")
+        self._stopped = True
+
     def start(self):
-        log.msg("starting download [%s]" % idlib.b2a(self._storage_index))
+        log.msg("starting download [%s]" % idlib.b2a(self._storage_index)[:6])
 
         # first step: who should we download from?
         d = defer.maybeDeferred(self._get_all_shareholders)
@@ -324,6 +350,11 @@ class FileDownloader:
         d.addCallback(self._create_validated_buckets)
         # once we know that, we can download blocks from everybody
         d.addCallback(self._download_all_segments)
+        def _finished(res):
+            if IConsumer.providedBy(self._downloadable):
+                self._downloadable.unregisterProducer()
+            return res
+        d.addBoth(_finished)
         def _failed(why):
             self._output.fail(why)
             return why
@@ -541,20 +572,40 @@ class FileDownloader:
         d = defer.succeed(None)
         for segnum in range(self._total_segments-1):
             d.addCallback(self._download_segment, segnum)
+            # this pause, at the end of write, prevents pre-fetch from
+            # happening until the consumer is ready for more data.
+            d.addCallback(self._check_for_pause)
         d.addCallback(self._download_tail_segment, self._total_segments-1)
         return d
 
+    def _check_for_pause(self, res):
+        if self._paused:
+            d = defer.Deferred()
+            self._paused.addCallback(lambda ignored: d.callback(res))
+            return d
+        if self._stopped:
+            raise DownloadStopped("our Consumer called stopProducing()")
+        return res
+
     def _download_segment(self, res, segnum):
+        log.msg("downloading seg#%d of %d (%d%%)"
+                % (segnum, self._total_segments,
+                   100.0 * segnum / self._total_segments))
         # memory footprint: when the SegmentDownloader finishes pulling down
         # all shares, we have 1*segment_size of usage.
         segmentdler = SegmentDownloader(self, segnum, self._num_needed_shares)
         d = segmentdler.start()
+        # pause before using more memory
+        d.addCallback(self._check_for_pause)
         # while the codec does its job, we hit 2*segment_size
         d.addCallback(lambda (shares, shareids):
                       self._codec.decode(shares, shareids))
         # once the codec is done, we drop back to 1*segment_size, because
         # 'shares' goes out of scope. The memory usage is all in the
         # plaintext now, spread out into a bunch of tiny buffers.
+        # pause/check-for-stop just before writing, to honor stopProducing
+        d.addCallback(self._check_for_pause)
         def _done(buffers):
             # we start by joining all these buffers together into a single
             # string. This makes Output.write easier, since it wants to hash
@@ -571,10 +622,17 @@ class FileDownloader:
         return d
 
     def _download_tail_segment(self, res, segnum):
+        log.msg("downloading seg#%d of %d (%d%%)"
+                % (segnum, self._total_segments,
+                   100.0 * segnum / self._total_segments))
         segmentdler = SegmentDownloader(self, segnum, self._num_needed_shares)
         d = segmentdler.start()
+        # pause before using more memory
+        d.addCallback(self._check_for_pause)
         d.addCallback(lambda (shares, shareids):
                       self._tail_codec.decode(shares, shareids))
+        # pause/check-for-stop just before writing, to honor stopProducing
+        d.addCallback(self._check_for_pause)
         def _done(buffers):
             # trim off any padding added by the upload side
             segment = "".join(buffers)
@@ -589,11 +647,8 @@ class FileDownloader:
         return d
 
     def _done(self, res):
+        log.msg("download done [%s]" % idlib.b2a(self._storage_index)[:6])
         self._output.close()
-        log.msg("computed CRYPTTEXT_HASH: %s" %
-                idlib.b2a(self._output.crypttext_hash))
-        log.msg("computed PLAINTEXT_HASH: %s" %
-                idlib.b2a(self._output.plaintext_hash))
         if self.check_crypttext_hash:
             _assert(self._crypttext_hash == self._output.crypttext_hash,
                     "bad crypttext_hash: computed=%s, expected=%s" %

webish.py

@@ -5,6 +5,7 @@ from twisted.application import service, strports, internet
 from twisted.web import static, resource, server, html, http
 from twisted.python import util, log
 from twisted.internet import defer
+from twisted.internet.interfaces import IConsumer
 from nevow import inevow, rend, loaders, appserver, url, tags as T
 from nevow.static import File as nevow_File # TODO: merge with static.File?
 from allmydata.util import fileutil
@@ -271,12 +272,18 @@ class Directory(rend.Page):
         return ""
 
 class WebDownloadTarget:
-    implements(IDownloadTarget)
+    implements(IDownloadTarget, IConsumer)
     def __init__(self, req, content_type, content_encoding):
         self._req = req
         self._content_type = content_type
         self._content_encoding = content_encoding
         self._opened = False
+        self._producer = None
+
+    def registerProducer(self, producer, streaming):
+        self._req.registerProducer(producer, streaming)
+    def unregisterProducer(self):
+        self._req.unregisterProducer()
 
     def open(self, size):
         self._opened = True
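
Because FileDownloader only calls registerProducer() when
IConsumer.providedBy(downloadable) is true, other download targets can opt
into the same backpressure simply by declaring IConsumer. A hypothetical
example (BufferingTarget and its write path are invented; only the IConsumer
methods mirror what WebDownloadTarget does above, and the modern @implementer
decorator stands in for the 2007-era implements() call):

    from zope.interface import implementer
    from twisted.internet.interfaces import IConsumer

    @implementer(IConsumer)
    class BufferingTarget:
        """Hypothetical download target that accepts a producer. It never
        pauses on its own, but because it declares IConsumer the downloader
        registers itself and unregisters when the download finishes."""

        def __init__(self):
            self.chunks = []
            self._producer = None

        # --- IConsumer ---
        def registerProducer(self, producer, streaming):
            self._producer = producer

        def unregisterProducer(self):
            self._producer = None

        def write(self, data):
            # shared by IConsumer and the download target's write path
            self.chunks.append(data)

        # the rest of the IDownloadTarget methods (open/close/fail/finish)
        # would sit alongside these, as they do in WebDownloadTarget.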