#598: add cli+backupdb tests, improve user display, update docs, move docs out of proposed/

Brian Warner 2009-02-05 22:07:01 -07:00
parent add0e1f2f8
commit ca32db5b39
4 changed files with 391 additions and 293 deletions

docs/backupdb.txt (new file, 160 lines)

@ -0,0 +1,160 @@
= The Tahoe BackupDB =
To speed up backup operations, Tahoe maintains a small database known as the
"backupdb". This is used to avoid re-uploading files which have already been
uploaded recently.
This database lives in ~/.tahoe/private/backupdb.sqlite, and is a single-file
SQLite database. It is used by the "tahoe backup" command (unless the
--no-backupdb option is included). In the future, it will also be used by
"tahoe mirror", and by "tahoe cp" when the --use-backupdb option is included.
The purpose of this database is specifically to manage the file-to-cap
translation (the "upload" step). It does not address directory updates. A
future version will include a directory cache.
The overall goal of optimizing backup is to reduce the work required when the
source disk has not changed since the last backup. In the ideal case, running
"tahoe backup" twice in a row, with no intervening changes to the disk, will
not require any network traffic.
This database is optional. If it is deleted, the worst effect is that a
subsequent backup operation may use more effort (network bandwidth, CPU
cycles, and disk IO) than it would have without the backupdb.
The database uses sqlite3, which is included as part of the standard python
library with python2.5 and later. For python2.4, please install the
"pysqlite2" package (which, despite the name, actually provides sqlite3
rather than sqlite2).
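For reference, the import fallback amounts to a few lines; this is a sketch
only, and the real backupdb module may arrange it differently:

# Minimal sketch of the import fallback described above.
try:
    import sqlite3            # standard library in python2.5 and later
except ImportError:
    from pysqlite2 import dbapi2 as sqlite3   # python2.4 with pysqlite2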
== Schema ==
The database contains the following tables:
CREATE TABLE version
(
 version integer  -- contains one row, set to 1
);

CREATE TABLE local_files
(
 path varchar(1024) PRIMARY KEY, -- index, this is os.path.abspath(fn)
 size integer,                   -- os.stat(fn)[stat.ST_SIZE]
 mtime number,                   -- os.stat(fn)[stat.ST_MTIME]
 ctime number,                   -- os.stat(fn)[stat.ST_CTIME]
 fileid integer
);

CREATE TABLE caps
(
 fileid integer PRIMARY KEY AUTOINCREMENT,
 filecap varchar(256) UNIQUE -- URI:CHK:...
);

CREATE TABLE last_upload
(
 fileid INTEGER PRIMARY KEY,
 last_uploaded TIMESTAMP,
 last_checked TIMESTAMP
);
Notes: if we extend the backupdb to assist with directory maintenance (see
below), we may need paths in multiple places, so it would make sense to
create a table for them, and change the last_upload table to refer to a
pathid instead of an absolute path:
CREATE TABLE paths
(
path varchar(1024) UNIQUE, -- index
pathid integer PRIMARY KEY AUTOINCREMENT
);
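For illustration, the database file could be created and verified on first
use with code along these lines. This is a sketch: the real entry point is
backupdb.get_backupdb() (used by the CLI code), and the open_backupdb()
helper and its schema_sql argument are hypothetical.

import os, sqlite3

def open_backupdb(dbfile, schema_sql):
    # schema_sql holds the four CREATE TABLE statements above (version,
    # local_files, caps, last_upload) plus
    # "INSERT INTO version (version) VALUES (1);"
    must_create = not os.path.exists(dbfile)
    conn = sqlite3.connect(dbfile)
    if must_create:
        conn.executescript(schema_sql)
        conn.commit()
    # refuse to touch a database with an unexpected schema version
    version = conn.execute("SELECT version FROM version").fetchone()[0]
    if version != 1:
        raise RuntimeError("unknown backupdb schema version %d" % version)
    return conn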
== Operation ==
The upload process starts with a pathname (like ~/.emacs) and wants to end up
with a file-cap (like URI:CHK:...).
The first step is to convert the path to an absolute form
(/home/warner/emacs) and do a lookup in the 'local_files' table. If the path
is not present in this table, the file must be uploaded. The upload process
is as follows (a sketch of the corresponding database operations appears
after the list):
1. record the file's size, creation time, and modification time
2. upload the file into the grid, obtaining an immutable file read-cap
3. add an entry to the 'caps' table, with the read-cap, to get a fileid
4. add an entry to the 'last_upload' table, with the current time
5. add an entry to the 'local_files' table, with the fileid, the path,
and the local file's size/ctime/mtime
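A sketch of the database side of these steps follows. The upload() argument
stands in for the real grid upload, and the record_upload() helper itself is
a placeholder, not the actual CLI code.

import os, stat, time

def record_upload(conn, path, upload):
    # Assumes the path was not already present (the "must upload" case).
    # upload(abspath) -> immutable read-cap (URI:CHK:...), i.e. step 2.
    abspath = os.path.abspath(path)
    s = os.stat(abspath)
    size, mtime, ctime = s[stat.ST_SIZE], s[stat.ST_MTIME], s[stat.ST_CTIME]  # step 1
    filecap = upload(abspath)                                                 # step 2
    now = time.time()
    c = conn.cursor()
    c.execute("INSERT INTO caps (filecap) VALUES (?)", (filecap,))            # step 3
    fileid = c.lastrowid
    c.execute("INSERT INTO last_upload (fileid, last_uploaded, last_checked)"
              " VALUES (?,?,?)", (fileid, now, now))                          # step 4
    c.execute("INSERT INTO local_files (path, size, mtime, ctime, fileid)"
              " VALUES (?,?,?,?,?)", (abspath, size, mtime, ctime, fileid))   # step 5
    conn.commit()
    return filecap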
If the path *is* present in 'local_files', the easy-to-compute identifying
information is compared: file size and ctime/mtime. If these differ, the file
must be uploaded. The row is removed from the 'local_files' table, and the
upload process above is followed.
If the path is present but ctime or mtime differs, the file may have changed.
If the size differs, then the file has certainly changed. At this point, a
future version of the "backup" command might hash the file and look for a
match in an as-yet-defined table, in the hopes that the file has simply been
moved from somewhere else on the disk. This enhancement requires changes to
the Tahoe upload API before it can be significantly more efficient than
simply handing the file to Tahoe and relying upon the normal convergence to
notice the similarity.
If ctime, mtime, or size is different, the client will upload the file, as
above.
If these identifiers are the same, the client will assume that the file is
unchanged (unless the --ignore-timestamps option is provided, in which case
the client always re-uploads the file), and it may be allowed to skip the
upload. For safety, however, we require that the client periodically perform
a filecheck on these probably-already-uploaded files, and re-upload anything
that doesn't look healthy. The client looks up the fileid in the
'last_upload' table to see how long it has been since the file was last
checked.
A "random early check" algorithm should be used, in which a check is
performed with a probability that increases with the age of the previous
results. For example, files that were last checked within the past month are
not checked, files checked five weeks ago are re-checked with 25%
probability, files checked six weeks ago with 50% probability, and files
checked more than eight weeks ago are always re-checked. This reduces the
"thundering herd" of filechecks-on-everything that would otherwise result
when a backup operation is run one month after the original backup. If a
filecheck reveals the file is not healthy, it is re-uploaded.
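The probability ramp described above can be computed directly from the age of
the last check; a sketch (the four-week MONTH constant is chosen to reproduce
the 25%/50%/100% figures):

import random, time

MONTH = 4 * 7 * 24 * 60 * 60   # four weeks, in seconds

def should_check(last_checked):
    # 0% within one month of the last check, rising linearly to 100% at
    # two months: 5 weeks -> 25%, 6 weeks -> 50%, 8 weeks or more -> always.
    age = time.time() - last_checked
    probability = (age - MONTH) / float(MONTH)
    probability = min(max(probability, 0.0), 1.0)
    return random.random() < probability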
If the filecheck shows the file is healthy, or if the filecheck was skipped,
the client gets to skip the upload, and uses the previous filecap (from the
'caps' table) to add to the parent directory.
If a new file is uploaded, new entries are put in the 'caps' and
'last_upload' tables, and an entry is made in the 'local_files' table to
reflect the mapping from local disk pathname to uploaded filecap. If an old
file is re-uploaded, the 'last_upload' entry is updated with the new
timestamps. If an old file is checked and found healthy, the 'last_upload'
entry is updated.
Relying upon timestamps is a compromise between efficiency and safety: a file
which is modified without changing the timestamp or size will be treated as
unmodified, and the "tahoe backup" command will not copy the new contents
into the grid. The --ignore-timestamps option can be used to disable this
optimization, forcing every byte of the file to be hashed and encoded.
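Concretely, the fast-path decision reduces to comparing the stored
size/ctime/mtime against the current values; a sketch (flag plumbing
simplified):

def file_changed(current, stored, ignore_timestamps=False):
    # current and stored are (size, ctime, mtime) tuples
    cur_size, cur_ctime, cur_mtime = current
    old_size, old_ctime, old_mtime = stored
    if cur_size != old_size:
        return True          # a size change always means re-upload
    if ignore_timestamps:
        return True          # --ignore-timestamps: never trust the fast path
    return (cur_ctime != old_ctime) or (cur_mtime != old_mtime)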
== Directory Caching ==
A future version of the backupdb will also record a secure hash of the most
recent contents of each tahoe directory that was used in the last backup run.
The directories created by the "tahoe backup" command are all read-only, so
it should be difficult to violate the assumption that these directories are
unmodified since the previous pass. In the future, Tahoe will provide truly
immutable directories, making this assumption even more solid.
In the current implementation, when the backup algorithm is faced with the
decision to either create a new directory or share an old one, it must read
the contents of the old directory to compare it against the desired new
contents. This means that a "null backup" (performing a backup when nothing
has been changed) must still read every Tahoe directory from the previous
backup.
With a directory-caching backupdb, these directory reads will be bypassed,
and the null backup will use minimal network bandwidth: one directory read
and two modifies. The Archives/ directory must be read to locate the latest
backup, and must be modified to add a new snapshot, and the Latest/ directory
will be updated to point to that same snapshot.
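One possible shape for that cache is a table mapping each backed-up directory
to a hash of its contents, so an unchanged directory can be recognized
without a network read; a sketch (the hashing scheme and any such table are
hypothetical, not part of the schema above):

import hashlib

def dirhash(contents):
    # contents: dict of childname -> (type, readcap, metadata), as assembled
    # for the new directory; hash a sorted serialization of names and caps.
    h = hashlib.sha256()
    for name in sorted(contents):
        childtype, readcap, _metadata = contents[name]
        h.update("%d:%s:%s," % (len(name), name, readcap))
    return h.hexdigest()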


@ -1,188 +0,0 @@
= PRELIMINARY =
This document is a description of a feature which is not yet implemented,
added here to solicit feedback and to describe future plans. This document is
subject to revision or withdrawal at any moment. Until this notice is
removed, consider this entire document to be a figment of your imagination.
= The Tahoe BackupDB =
To speed up backup operations, Tahoe maintains a small database known as the
"backupdb". This is used to avoid re-uploading files which have already been
uploaded recently.
This database lives in ~/.tahoe/private/backupdb.sqlite, and is a SQLite
single-file database. It is used by the "tahoe backup" command, and by the
"tahoe cp" command when the --use-backupdb option is included.
The purpose of this database is specifically to manage the file-to-cap
translation (the "upload" step). It does not address directory updates.
The overall goal of optimizing backup is to reduce the work required when the
source disk has not changed since the last backup. In the ideal case, running
"tahoe backup" twice in a row, with no intervening changes to the disk, will
not require any network traffic.
This database is optional. If it is deleted, the worst effect is that a
subsequent backup operation may use more effort (network bandwidth, CPU
cycles, and disk IO) than it would have without the backupdb.
== Schema ==
The database contains the following tables:
CREATE TABLE version
(
version integer # contains one row, set to 0
);
CREATE TABLE last_upload
(
path varchar(1024), # index, this is os.path.abspath(fn)
size integer, # os.stat(fn)[stat.ST_SIZE]
mtime number, # os.stat(fn)[stat.ST_MTIME]
fileid integer
);
CREATE TABLE caps
(
fileid integer PRIMARY KEY AUTOINCREMENT,
filecap varchar(256), # URI:CHK:...
last_uploaded timestamp,
last_checked timestamp
);
CREATE TABLE keys_to_files
(
readkey varchar(256) PRIMARY KEY, # index, AES key portion of filecap
fileid integer
);
Notes: if we extend the backupdb to assist with directory maintenance (see
below), we may need paths in multiple places, so it would make sense to
create a table for them, and change the last_upload table to refer to a
pathid instead of an absolute path:
CREATE TABLE paths
(
path varchar(1024), # index
pathid integer PRIMARY KEY AUTOINCREMENT
);
== Operation ==
The upload process starts with a pathname (like ~/.emacs) and wants to end up
with a file-cap (like URI:CHK:...).
The first step is to convert the path to an absolute form
(/home/warner/emacs) and do a lookup in the last_upload table. If the path is
not present in this table, the file must be uploaded. The upload process is:
1. record the file's size and modification time
2. upload the file into the grid, obtaining an immutable file read-cap
3. add an entry to the 'caps' table, with the read-cap, and the current time
4. extract the read-key from the read-cap, add an entry to 'keys_to_files'
5. add an entry to 'last_upload'
If the path *is* present in 'last_upload', the easy-to-compute identifying
information is compared: file size and modification time. If these differ,
the file must be uploaded. The row is removed from the last_upload table, and
the upload process above is followed.
If the path is present but the mtime differs, the file may have changed. If
the size differs, then the file has certainly changed. The client will
compute the CHK read-key for the file by hashing its contents, using exactly
the same algorithm as the node does when it uploads a file (including
~/.tahoe/private/convergence). It then checks the 'keys_to_files' table to
see if this file has been uploaded before: perhaps the file was moved from
elsewhere on the disk. If no match is found, the file must be uploaded, so
the upload process above is followed.
If the read-key *is* found in the 'keys_to_files' table, then the file has
been uploaded before, but we should consider performing a file check / verify
operation to make sure we can skip a new upload. The fileid is used to
retrieve the entry from the 'caps' table, and the last_checked timestamp is
examined. If this timestamp is too old, a filecheck operation should be
performed, and the file repaired if the results are not satisfactory. A
"random early check" algorithm should be used, in which a check is performed
with a probability that increases with the age of the previous results. E.g.
files that were last checked within a month are not checked, files that were
checked 5 weeks ago are re-checked with 25% probability, 6 weeks with 50%,
more than 8 weeks are always checked. This reduces the "thundering herd" of
filechecks-on-everything that would otherwise result when a backup operation
is run one month after the original backup. The readkey can be submitted to
the upload operation, to remove a duplicate hashing pass through the file and
reduce the disk IO. In a future version of the storage server protocol, this
could also improve the "streamingness" of the upload process.
If the file's size and mtime match, the file is considered to be unmodified,
and the last_checked timestamp from the 'caps' table is examined as above
(possibly resulting in a filecheck or repair). The --no-timestamps option
disables this check: this removes the danger of false-positives (i.e. not
uploading a new file, because it appeared to be the same as a previously
uploaded one), but increases the amount of disk IO that must be performed
(every byte of every file must be hashed to compute the readkey).
This algorithm is summarized in the following pseudocode:
{{{
def backup(path):
abspath = os.path.abspath(path)
result = check_for_upload(abspath)
now = time.time()
if result == MUST_UPLOAD:
filecap = upload(abspath, key=result.readkey)
fileid = db("INSERT INTO caps (filecap, last_uploaded, last_checked)",
(filecap, now, now))
db("INSERT INTO keys_to_files", (result.readkey, filecap))
db("INSERT INTO last_upload", (abspath,current_size,current_mtime,fileid))
if result in (MOVED, ALREADY_UPLOADED):
age = now - result.last_checked
probability = (age - 1*MONTH) / 1*MONTH
probability = min(max(probability, 0.0), 1.0)
if random.random() < probability:
do_filecheck(result.filecap)
if result == MOVED:
db("INSERT INTO last_upload",
(abspath, current_size, current_mtime, result.fileid))
def check_for_upload(abspath):
row = db("SELECT (size,mtime,fileid) FROM last_upload WHERE path == %s"
% abspath)
if not row:
return check_moved(abspath)
current_size = os.stat(abspath)[stat.ST_SIZE]
current_mtime = os.stat(abspath)[stat.ST_MTIME]
(last_size,last_mtime,last_fileid) = row
if file_changed(current_size, last_size, current_mtime, last_mtime):
db("DELETE FROM last_upload WHERE fileid=%s" % fileid)
return check_moved(abspath)
(filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" +
" WHERE fileid == %s" % last_fileid)
return ALREADY_UPLOADED(filecap=filecap, last_checked=last_checked)
def file_changed(current_size, last_size, current_mtime, last_mtime):
if last_size != current_size:
return True
if NO_TIMESTAMPS:
return True
if last_mtime != current_mtime:
return True
return False
def check_moved(abspath):
readkey = hash_with_convergence(abspath)
fileid = db("SELECT (fileid) FROM keys_to_files WHERE readkey == %s"%readkey)
if not fileid:
return MUST_UPLOAD(readkey=readkey)
(filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" +
" WHERE fileid == %s" % fileid)
return MOVED(fileid=fileid, filecap=filecap, last_checked=last_checked)
def do_filecheck(filecap):
health = check(filecap)
if health < DESIRED:
repair(filecap)
}}}


@ -50,24 +50,6 @@ def parse_old_timestamp(s, options):
print >>options.stderr, "unable to parse old timestamp '%s', ignoring" % s
return None
def readdir(dircap, options):
# returns a dict of (childname: (type, readcap, metadata)), or None if the
# dircap didn't point to a directory
url = options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap)
resp = do_http("GET", url)
if resp.status != 200:
raiseHTTPError("Error during directory GET", resp)
jd = simplejson.load(resp)
ntype, ndata = jd
if ntype != "dirnode":
return None
contents = {}
for (childname, (childtype, childdata)) in ndata["children"].items():
contents[childname] = (childtype,
str(childdata["ro_uri"]),
childdata["metadata"])
return contents
def get_local_metadata(path):
metadata = {}
@ -131,100 +113,120 @@ def directory_is_changed(a, b):
return True
return False
def backup(options):
nodeurl = options['node-url']
from_dir = options.from_dir
to_dir = options.to_dir
if options['quiet']:
verbosity = 0
else:
verbosity = 2
stdin = options.stdin
stdout = options.stdout
stderr = options.stderr
class BackerUpper:
def __init__(self, options):
self.options = options
self.files_uploaded = 0
self.files_reused = 0
self.files_checked = 0
self.directories_read = 0
self.directories_created = 0
self.directories_reused = 0
self.directories_checked = 0
use_backupdb = not options["no-backupdb"]
options.backupdb = None
if use_backupdb:
bdbfile = os.path.join(options["node-directory"],
"private", "backupdb.sqlite")
bdbfile = os.path.abspath(bdbfile)
options.backupdb = backupdb.get_backupdb(bdbfile)
def run(self):
options = self.options
nodeurl = options['node-url']
from_dir = options.from_dir
to_dir = options.to_dir
self.verbosity = 1
if options['quiet']:
self.verbosity = 0
if options['verbose']:
self.verbosity = 2
stdin = options.stdin
stdout = options.stdout
stderr = options.stderr
rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
if path:
to_url += escape_path(path)
if not to_url.endswith("/"):
to_url += "/"
self.backupdb = None
use_backupdb = not options["no-backupdb"]
if use_backupdb:
bdbfile = os.path.join(options["node-directory"],
"private", "backupdb.sqlite")
bdbfile = os.path.abspath(bdbfile)
self.backupdb = backupdb.get_backupdb(bdbfile)
archives_url = to_url + "Archives/"
latest_url = to_url + "Latest"
rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
if path:
to_url += escape_path(path)
if not to_url.endswith("/"):
to_url += "/"
# first step: make sure the target directory exists, as well as the
# Archives/ subdirectory.
resp = do_http("GET", archives_url + "?t=json")
if resp.status == 404:
resp = do_http("POST", archives_url + "?t=mkdir")
if resp.status != 200:
print >>stderr, "Unable to create target directory: %s %s %s" % \
(resp.status, resp.reason, resp.read())
return 1
archives_dir = {}
else:
jdata = simplejson.load(resp)
(otype, attrs) = jdata
archives_dir = attrs["children"]
archives_url = to_url + "Archives/"
latest_url = to_url + "Latest"
# second step: locate the most recent backup in TODIR/Archives/*
latest_backup_time = 0
latest_backup_name = None
latest_backup_dircap = None
# first step: make sure the target directory exists, as well as the
# Archives/ subdirectory.
resp = do_http("GET", archives_url + "?t=json")
if resp.status == 404:
resp = do_http("POST", archives_url + "?t=mkdir")
if resp.status != 200:
print >>stderr, "Unable to create target directory: %s %s %s" % \
(resp.status, resp.reason, resp.read())
return 1
archives_dir = {}
else:
jdata = simplejson.load(resp)
(otype, attrs) = jdata
archives_dir = attrs["children"]
# we have various time formats. The allmydata.com windows backup tool
# appears to create things like "2008-11-16 10.34 PM". This script
# creates things like "2009-11-16--17.34Z".
for archive_name in archives_dir.keys():
if archives_dir[archive_name][0] != "dirnode":
continue
when = parse_old_timestamp(archive_name, options)
if when is not None:
if when > latest_backup_time:
latest_backup_time = when
latest_backup_name = archive_name
latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"])
# second step: locate the most recent backup in TODIR/Archives/*
latest_backup_time = 0
latest_backup_name = None
latest_backup_dircap = None
# third step: process the tree
new_backup_dircap = Node().process(options.from_dir,
latest_backup_dircap,
options)
print >>stdout, "new backup done"
# we have various time formats. The allmydata.com windows backup tool
# appears to create things like "2008-11-16 10.34 PM". This script
# creates things like "2009-11-16--17.34Z".
for archive_name in archives_dir.keys():
if archives_dir[archive_name][0] != "dirnode":
continue
when = parse_old_timestamp(archive_name, options)
if when is not None:
if when > latest_backup_time:
latest_backup_time = when
latest_backup_name = archive_name
latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"])
# fourth: attach the new backup to the list
new_readonly_backup_dircap = readonly(new_backup_dircap)
now = time_format.iso_utc(int(time.time()), sep="_") + "Z"
# third step: process the tree
new_backup_dircap = self.process(options.from_dir, latest_backup_dircap)
put_child(archives_url, now, new_readonly_backup_dircap)
put_child(to_url, "Latest", new_readonly_backup_dircap)
# fourth: attach the new backup to the list
new_readonly_backup_dircap = readonly(new_backup_dircap)
now = time_format.iso_utc(int(time.time()), sep="_") + "Z"
print >>stdout, "backup done"
# done!
return 0
put_child(archives_url, now, new_readonly_backup_dircap)
put_child(to_url, "Latest", new_readonly_backup_dircap)
if self.verbosity >= 1:
print >>stdout, (" %d files uploaded (%d reused), "
"%d directories created (%d reused)"
% (self.files_uploaded,
self.files_reused,
self.directories_created,
self.directories_reused))
if self.verbosity >= 2:
print >>stdout, (" %d files checked, %d directories checked, "
"%d directories read"
% (self.files_checked,
self.directories_checked,
self.directories_read))
print >>stdout, " backup done"
# done!
return 0
class Node:
def verboseprint(self, msg):
if self.options["verbose"]:
if self.verbosity >= 2:
print >>self.options.stdout, msg
def process(self, localpath, olddircap, options):
def process(self, localpath, olddircap):
# returns newdircap
self.options = options
self.verboseprint("processing %s, olddircap %s" % (localpath, olddircap))
olddircontents = {}
if olddircap:
olddircontents = readdir(olddircap, options)
olddircontents = self.readdir(olddircap)
newdircontents = {} # childname -> (type, rocap, metadata)
for child in os.listdir(localpath):
@ -234,7 +236,8 @@ class Node:
oldchildcap = None
if olddircontents is not None and child in olddircontents:
oldchildcap = olddircontents[child][1]
newchilddircap = self.recurse(childpath, oldchildcap)
# recurse on the child directory
newchilddircap = self.process(childpath, oldchildcap)
newdircontents[child] = ("dirnode", newchilddircap, metadata)
elif os.path.isfile(childpath):
newfilecap, metadata = self.upload(childpath)
@ -248,25 +251,21 @@ class Node:
):
self.verboseprint(" %s not changed, re-using old directory" % localpath)
# yay! they're identical!
self.directories_reused += 1
return olddircap
else:
self.verboseprint(" %s changed, making new directory" % localpath)
# something changed, or there was no previous directory, so we
# must make a new directory
newdircap = mkdir(newdircontents, options)
newdircap = mkdir(newdircontents, self.options)
self.directories_created += 1
return readonly(newdircap)
def recurse(self, localpath, olddircap):
n = self.__class__()
return n.process(localpath, olddircap, self.options)
def check_backupdb(self, childpath):
if not self.options.backupdb:
if not self.backupdb:
return True, None
use_timestamps = not self.options["ignore-timestamps"]
bdb = self.options.backupdb
r = bdb.check_file(childpath, use_timestamps)
r = self.backupdb.check_file(childpath, use_timestamps)
if not r.was_uploaded():
return True, r
@ -281,6 +280,7 @@ class Node:
self.verboseprint("checking %s" % filecap)
nodeurl = self.options['node-url']
checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(filecap)
self.files_checked += 1
resp = do_http("POST", checkurl)
if resp.status != 200:
# can't check, so we must assume it's bad
@ -295,6 +295,25 @@ class Node:
r.did_check_healthy(cr)
return False, r
def readdir(self, dircap):
# returns a dict of (childname: (type, readcap, metadata)), or None
# if the dircap didn't point to a directory
self.directories_read += 1
url = self.options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap)
resp = do_http("GET", url)
if resp.status != 200:
raiseHTTPError("Error during directory GET", resp)
jd = simplejson.load(resp)
ntype, ndata = jd
if ntype != "dirnode":
return None
contents = {}
for (childname, (childtype, childdata)) in ndata["children"].items():
contents[childname] = (childtype,
str(childdata["ro_uri"]),
childdata["metadata"])
return contents
def upload(self, childpath):
#self.verboseprint("uploading %s.." % childpath)
metadata = get_local_metadata(childpath)
@ -316,9 +335,14 @@ class Node:
if bdb_results:
bdb_results.did_upload(filecap)
self.files_uploaded += 1
return filecap, metadata
else:
self.verboseprint("skipping %s.." % childpath)
self.files_reused += 1
return bdb_results.was_uploaded(), metadata
def backup(options):
bu = BackerUpper(options)
return bu.run()


@ -5,6 +5,7 @@ from twisted.trial import unittest
from cStringIO import StringIO
import urllib
import time
import re
from allmydata.util import fileutil, hashutil
from allmydata import uri
@ -16,7 +17,7 @@ _hush_pyflakes = [tahoe_ls, tahoe_get, tahoe_put, tahoe_rm, tahoe_cp]
from allmydata.scripts.common import DEFAULT_ALIAS, get_aliases
from allmydata.scripts import cli, debug, runner
from allmydata.scripts import cli, debug, runner, backupdb
from allmydata.test.common import SystemTestMixin
from twisted.internet import threads # CLI tests use deferToThread
@ -627,9 +628,23 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
f.write(data)
f.close()
def count_output(self, out):
mo = re.search(r"(\d)+ files uploaded \((\d+) reused\), (\d+) directories created \((\d+) reused\)", out)
return [int(s) for s in mo.groups()]
def count_output2(self, out):
mo = re.search(r"(\d)+ files checked, (\d+) directories checked, (\d+) directories read", out)
return [int(s) for s in mo.groups()]
def test_backup(self):
self.basedir = os.path.dirname(self.mktemp())
# is the backupdb available? If so, we test that a second backup does
# not create new directories.
hush = StringIO()
have_bdb = backupdb.get_backupdb(os.path.join(self.basedir, "dbtest"),
hush)
# create a small local directory with a couple of files
source = os.path.join(self.basedir, "home")
fileutil.make_dirs(os.path.join(source, "empty"))
@ -643,7 +658,15 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
def _check0((rc, out, err)):
self.failUnlessEqual(err, "")
self.failUnlessEqual(rc, 0)
fu, fr, dc, dr = self.count_output(out)
# foo.txt, bar.txt, blah.txt
self.failUnlessEqual(fu, 3)
self.failUnlessEqual(fr, 0)
# empty, home, home/parent, home/parent/subdir
self.failUnlessEqual(dc, 4)
self.failUnlessEqual(dr, 0)
d.addCallback(_check0)
d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups"))
def _check1((rc, out, err)):
self.failUnlessEqual(err, "")
@ -678,12 +701,62 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
d.addCallback(lambda res: self.do_cli("backup", source, "tahoe:backups"))
def _check4a((rc, out, err)):
# second backup should reuse everything, if the backupdb is
# available
self.failUnlessEqual(err, "")
self.failUnlessEqual(rc, 0)
if have_bdb:
fu, fr, dc, dr = self.count_output(out)
# foo.txt, bar.txt, blah.txt
self.failUnlessEqual(fu, 0)
self.failUnlessEqual(fr, 3)
# empty, home, home/parent, home/parent/subdir
self.failUnlessEqual(dc, 0)
self.failUnlessEqual(dr, 4)
d.addCallback(_check4a)
if have_bdb:
# sneak into the backupdb, crank back the "last checked"
# timestamp to force a check on all files
def _reset_last_checked(res):
dbfile = os.path.join(self.basedir,
"client0", "private", "backupdb.sqlite")
self.failUnless(os.path.exists(dbfile), dbfile)
bdb = backupdb.get_backupdb(dbfile)
bdb.cursor.execute("UPDATE last_upload SET last_checked=0")
bdb.connection.commit()
d.addCallback(_reset_last_checked)
d.addCallback(lambda res:
self.do_cli("backup", "--verbose", source, "tahoe:backups"))
def _check4b((rc, out, err)):
# we should check all files, and re-use all of them. None of
# the directories should have been changed.
self.failUnlessEqual(err, "")
self.failUnlessEqual(rc, 0)
fu, fr, dc, dr = self.count_output(out)
fchecked, dchecked, dread = self.count_output2(out)
self.failUnlessEqual(fchecked, 3)
self.failUnlessEqual(fu, 0)
self.failUnlessEqual(fr, 3)
# TODO: backupdb doesn't do dirs yet; when it does, this will
# change to dchecked=4, and maybe dread=0
self.failUnlessEqual(dchecked, 0)
self.failUnlessEqual(dread, 4)
self.failUnlessEqual(dc, 0)
self.failUnlessEqual(dr, 4)
d.addCallback(_check4b)
d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
def _check5((rc, out, err)):
self.failUnlessEqual(err, "")
self.failUnlessEqual(rc, 0)
self.new_archives = out.split()
self.failUnlessEqual(len(self.new_archives), 2)
self.failUnlessEqual(len(self.new_archives), 3)
# the original backup should still be the oldest (i.e. sorts
# alphabetically towards the beginning)
self.failUnlessEqual(sorted(self.new_archives)[0],
self.old_archives[0])
d.addCallback(_check5)
@ -701,12 +774,27 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
self.writeto("empty", "imagine nothing being here")
return self.do_cli("backup", source, "tahoe:backups")
d.addCallback(_modify)
def _check5a((rc, out, err)):
# second backup should reuse bar.txt (if backupdb is available),
# and upload the rest. None of the directories can be reused.
self.failUnlessEqual(err, "")
self.failUnlessEqual(rc, 0)
if have_bdb:
fu, fr, dc, dr = self.count_output(out)
# new foo.txt, surprise file, subfile, empty
self.failUnlessEqual(fu, 4)
# old bar.txt
self.failUnlessEqual(fr, 1)
# home, parent, subdir, blah.txt, surprisedir
self.failUnlessEqual(dc, 5)
self.failUnlessEqual(dr, 0)
d.addCallback(_check5a)
d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
def _check6((rc, out, err)):
self.failUnlessEqual(err, "")
self.failUnlessEqual(rc, 0)
self.new_archives = out.split()
self.failUnlessEqual(len(self.new_archives), 3)
self.failUnlessEqual(len(self.new_archives), 4)
self.failUnlessEqual(sorted(self.new_archives)[0],
self.old_archives[0])
d.addCallback(_check6)
@ -724,5 +812,19 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
self.failUnlessEqual(out, "foo")
d.addCallback(_check8)
d.addCallback(lambda res:
self.do_cli("backup", "--no-backupdb", source, "tahoe:backups"))
def _check9((rc, out, err)):
# --no-backupdb means re-upload everything. We still get to
# re-use the directories, since nothing changed.
self.failUnlessEqual(err, "")
self.failUnlessEqual(rc, 0)
fu, fr, dc, dr = self.count_output(out)
self.failUnlessEqual(fu, 5)
self.failUnlessEqual(fr, 0)
self.failUnlessEqual(dc, 0)
self.failUnlessEqual(dr, 5)
d.addCallback(_check9)
return d