From 46955202e28091b9fbc87e22f8c3055c16b9c42b Mon Sep 17 00:00:00 2001 From: Jean-Paul Calderone Date: Wed, 18 Nov 2020 15:47:06 -0500 Subject: [PATCH] Hook into Twisted Web to control where request bodies are written --- src/allmydata/client.py | 24 +++++- src/allmydata/test/web/test_web.py | 10 ++- src/allmydata/test/web/test_webish.py | 104 +++++++++++++++++++++++++- src/allmydata/webish.py | 45 +++++++---- 4 files changed, 163 insertions(+), 20 deletions(-) diff --git a/src/allmydata/client.py b/src/allmydata/client.py index a768ba354..88a74e186 100644 --- a/src/allmydata/client.py +++ b/src/allmydata/client.py @@ -33,6 +33,7 @@ from allmydata.introducer.client import IntroducerClient from allmydata.util import ( hashutil, base32, pollmixin, log, idlib, yamlutil, configutil, + fileutil, ) from allmydata.util.encodingutil import get_filesystem_encoding from allmydata.util.abbreviate import parse_abbreviated_size @@ -1043,6 +1044,21 @@ class _Client(node.Node, pollmixin.PollMixin): def set_default_mutable_keysize(self, keysize): self._key_generator.set_default_keysize(keysize) + def _get_tempdir(self): + """ + Determine the path to the directory where temporary files for this node + should be written. + + :return bytes: The path which will exist and be a directory. + """ + tempdir_config = self.config.get_config("node", "tempdir", "tmp") + if isinstance(tempdir_config, bytes): + tempdir_config = tempdir_config.decode('utf-8') + tempdir = self.config.get_config_path(tempdir_config) + if not os.path.exists(tempdir): + fileutil.make_dirs(tempdir) + return tempdir + def init_web(self, webport): self.log("init_web(webport=%s)", args=(webport,)) @@ -1050,7 +1066,13 @@ class _Client(node.Node, pollmixin.PollMixin): nodeurl_path = self.config.get_config_path("node.url") staticdir_config = self.config.get_config("node", "web.static", "public_html") staticdir = self.config.get_config_path(staticdir_config) - ws = WebishServer(self, webport, nodeurl_path, staticdir) + ws = WebishServer( + self, + webport, + self._get_tempdir(), + nodeurl_path, + staticdir, + ) ws.setServiceParent(self) def init_ftp_server(self): diff --git a/src/allmydata/test/web/test_web.py b/src/allmydata/test/web/test_web.py index aa6d44ea4..27611e678 100644 --- a/src/allmydata/test/web/test_web.py +++ b/src/allmydata/test/web/test_web.py @@ -316,8 +316,14 @@ class WebMixin(TimezoneMixin): self.staticdir = self.mktemp() self.clock = Clock() self.fakeTime = 86460 # 1d 0h 1m 0s - self.ws = webish.WebishServer(self.s, "0", staticdir=self.staticdir, - clock=self.clock, now_fn=lambda:self.fakeTime) + self.ws = webish.WebishServer( + self.s, + "0", + tempdir=self.mktemp(), + staticdir=self.staticdir, + clock=self.clock, + now_fn=lambda:self.fakeTime, + ) self.ws.setServiceParent(self.s) self.webish_port = self.ws.getPortnum() self.webish_url = self.ws.getURL() diff --git a/src/allmydata/test/web/test_webish.py b/src/allmydata/test/web/test_webish.py index 1e659812f..67bfca69e 100644 --- a/src/allmydata/test/web/test_webish.py +++ b/src/allmydata/test/web/test_webish.py @@ -5,6 +5,19 @@ Tests for ``allmydata.webish``. from uuid import ( uuid4, ) +from errno import ( + EACCES, +) +from io import ( + BytesIO, +) + +from hypothesis import ( + given, +) +from hypothesis.strategies import ( + integers, +) from testtools.matchers import ( AfterPreprocessing, @@ -12,6 +25,7 @@ from testtools.matchers import ( Equals, MatchesAll, Not, + IsInstance, ) from twisted.python.filepath import ( @@ -30,7 +44,7 @@ from ..common import ( from ...webish import ( TahoeLAFSRequest, - tahoe_lafs_site, + TahoeLAFSSite, ) @@ -96,7 +110,7 @@ class TahoeLAFSRequestTests(SyncTestCase): class TahoeLAFSSiteTests(SyncTestCase): """ - Tests for the ``Site`` created by ``tahoe_lafs_site``. + Tests for ``TahoeLAFSSite``. """ def _test_censoring(self, path, censored): """ @@ -112,7 +126,7 @@ class TahoeLAFSSiteTests(SyncTestCase): """ logPath = self.mktemp() - site = tahoe_lafs_site(Resource(), logPath=logPath) + site = TahoeLAFSSite(self.mktemp(), Resource(), logPath=logPath) site.startFactory() channel = DummyChannel() @@ -170,6 +184,90 @@ class TahoeLAFSSiteTests(SyncTestCase): b"/uri?uri=[CENSORED]", ) + def _get_request(self, tempdir): + """ + Create and return a new ``TahoeLAFSRequest`` hooked up to a + ``TahoeLAFSSite``. + + :param bytes tempdir: The temporary directory to give to the site. + + :return TahoeLAFSRequest: The new request instance. + """ + site = TahoeLAFSSite(tempdir.path, Resource(), logPath=self.mktemp()) + site.startFactory() + + channel = DummyChannel() + channel.site = site + request = TahoeLAFSRequest(channel) + return request + + @given(integers(min_value=0, max_value=1024 * 1024 - 1)) + def test_small_content(self, request_body_size): + """ + A request body smaller than 1 MiB is kept in memory. + """ + tempdir = FilePath(self.mktemp()) + request = self._get_request(tempdir) + request.gotLength(request_body_size) + self.assertThat( + request.content, + IsInstance(BytesIO), + ) + + def _large_request_test(self, request_body_size): + """ + Assert that when a request with a body of of the given size is received + its content is written to the directory the ``TahoeLAFSSite`` is + configured with. + """ + tempdir = FilePath(self.mktemp()) + tempdir.makedirs() + request = self._get_request(tempdir) + + # So. Bad news. The temporary file for the uploaded content is + # unnamed (and this isn't even necessarily a bad thing since it is how + # you get automatic on-process-exit cleanup behavior on POSIX). It's + # not visible by inspecting the filesystem. It has no name we can + # discover. Then how do we verify it is written to the right place? + # The question itself is meaningless if we try to be too precise. It + # *has* no filesystem location. However, it is still stored *on* some + # filesystem. We still want to make sure it is on the filesystem we + # specified because otherwise it might be on a filesystem that's too + # small or undesirable in some other way. + # + # I don't know of any way to ask a file descriptor which filesystem + # it's on, either, though. It might be the case that the [f]statvfs() + # result could be compared somehow to infer the filesystem but + # ... it's not clear what the failure modes might be there, across + # different filesystems and runtime environments. + # + # Another approach is to make the temp directory unwriteable and + # observe the failure when an attempt is made to create a file there. + # This is hardly a lovely solution but at least it's kind of simple. + tempdir.chmod(0o550) + with self.assertRaises(OSError) as ctx: + request.gotLength(request_body_size) + self.assertThat( + ctx.exception.errno, + Equals(EACCES), + ) + + def test_unknown_request_size(self): + """ + A request body with an unknown size is written to a file in the temporary + directory passed to ``TahoeLAFSSite``. + """ + self._large_request_test(None) + + @given(integers(min_value=1024 * 1024)) + def test_large_request(self, request_body_size): + """ + A request body of 1 MiB or more is written to a file in the temporary + directory passed to ``TahoeLAFSSite``. + """ + self._large_request_test(request_body_size) + + def param(name, value): return u"; {}={}".format(name, value) diff --git a/src/allmydata/webish.py b/src/allmydata/webish.py index f94d6f7da..779458a59 100644 --- a/src/allmydata/webish.py +++ b/src/allmydata/webish.py @@ -1,13 +1,13 @@ from six import ensure_str -import re, time +import re, time, tempfile -from functools import ( - partial, -) from cgi import ( FieldStorage, ) +from io import ( + BytesIO, +) from twisted.application import service, strports, internet from twisted.web import static @@ -150,17 +150,34 @@ def _logFormatter(logDateTime, request): ) -tahoe_lafs_site = partial( - Site, - requestFactory=TahoeLAFSRequest, - logFormatter=_logFormatter, -) +class TahoeLAFSSite(Site): + """ + The HTTP protocol factory used by Tahoe-LAFS. + + Among the behaviors provided: + + * A configurable temporary directory where large request bodies can be + written so they don't stay in memory. + + * A log formatter that writes some access logs but omits capability + strings to help keep them secret. + """ + requestFactory = TahoeLAFSRequest + + def __init__(self, tempdir, *args, **kwargs): + Site.__init__(self, *args, logFormatter=_logFormatter, **kwargs) + self._tempdir = tempdir + + def getContentFile(self, length): + if length is None or length >= 1024 * 1024: + return tempfile.TemporaryFile(dir=self._tempdir) + return BytesIO() class WebishServer(service.MultiService): name = "webish" - def __init__(self, client, webport, nodeurl_path=None, staticdir=None, + def __init__(self, client, webport, tempdir, nodeurl_path=None, staticdir=None, clock=None, now_fn=time.time): service.MultiService.__init__(self) # the 'data' argument to all render() methods default to the Client @@ -170,7 +187,7 @@ class WebishServer(service.MultiService): # time in a deterministic manner. self.root = root.Root(client, clock, now_fn) - self.buildServer(webport, nodeurl_path, staticdir) + self.buildServer(webport, tempdir, nodeurl_path, staticdir) # If set, clock is a twisted.internet.task.Clock that the tests # use to test ophandle expiration. @@ -180,9 +197,9 @@ class WebishServer(service.MultiService): self.root.putChild(b"storage-plugins", StoragePlugins(client)) - def buildServer(self, webport, nodeurl_path, staticdir): + def buildServer(self, webport, tempdir, nodeurl_path, staticdir): self.webport = webport - self.site = tahoe_lafs_site(self.root) + self.site = TahoeLAFSSite(tempdir, self.root) self.staticdir = staticdir # so tests can check if staticdir: self.root.putChild("static", static.File(staticdir)) @@ -260,4 +277,4 @@ class IntroducerWebishServer(WebishServer): def __init__(self, introducer, webport, nodeurl_path=None, staticdir=None): service.MultiService.__init__(self) self.root = introweb.IntroducerRoot(introducer) - self.buildServer(webport, nodeurl_path, staticdir) + self.buildServer(webport, tempfile.tempdir, nodeurl_path, staticdir)