#598: add cli+backupdb tests, improve user display, update docs, move docs out of proposed/
parent: add0e1f2f8
commit: ca32db5b39
docs/backupdb.txt (new file, 160 lines)
@@ -0,0 +1,160 @@
= The Tahoe BackupDB =

To speed up backup operations, Tahoe maintains a small database known as the
"backupdb". It is used to avoid re-uploading files which have already been
uploaded recently.

This database lives in ~/.tahoe/private/backupdb.sqlite, and is a single-file
SQLite database. It is used by the "tahoe backup" command (unless the
--no-backupdb option is included). In the future, it will also be used by
"tahoe mirror", and by "tahoe cp" when the --use-backupdb option is included.

The purpose of this database is specifically to manage the file-to-cap
translation (the "upload" step). It does not address directory updates. A
future version will include a directory cache.

The overall goal of optimizing backup is to reduce the work required when the
source disk has not changed since the last backup. In the ideal case, running
"tahoe backup" twice in a row, with no intervening changes to the disk, will
not require any network traffic.

This database is optional. If it is deleted, the worst effect is that a
subsequent backup operation may use more effort (network bandwidth, CPU
cycles, and disk IO) than it would have without the backupdb.

The database uses sqlite3, which is included as part of the standard Python
library with Python 2.5 and later. For Python 2.4, please install the
"pysqlite2" package (which, despite the name, actually provides sqlite3
rather than sqlite2).
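
Since the backupdb is an ordinary SQLite file, it can be inspected directly.
The snippet below is an illustration only (it is not part of Tahoe) and
assumes the default ~/.tahoe node directory:

 # Illustration only: open an existing backupdb with the stdlib sqlite3
 # module and print its schema version.
 import os, sqlite3

 dbfile = os.path.expanduser("~/.tahoe/private/backupdb.sqlite")
 conn = sqlite3.connect(dbfile)
 print conn.execute("SELECT version FROM version").fetchone()[0]
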
== Schema ==

The database contains the following tables:

 CREATE TABLE version
 (
  version integer  -- contains one row, set to 1
 );

 CREATE TABLE local_files
 (
  path varchar(1024) PRIMARY KEY, -- index, this is os.path.abspath(fn)
  size integer,                   -- os.stat(fn)[stat.ST_SIZE]
  mtime number,                   -- os.stat(fn)[stat.ST_MTIME]
  ctime number,                   -- os.stat(fn)[stat.ST_CTIME]
  fileid integer
 );

 CREATE TABLE caps
 (
  fileid integer PRIMARY KEY AUTOINCREMENT,
  filecap varchar(256) UNIQUE     -- URI:CHK:...
 );

 CREATE TABLE last_upload
 (
  fileid INTEGER PRIMARY KEY,
  last_uploaded TIMESTAMP,
  last_checked TIMESTAMP
 );

Notes: if we extend the backupdb to assist with directory maintenance (see
below), we may need paths in multiple places, so it would make sense to
create a table for them, and change the local_files table to refer to a
pathid instead of an absolute path:

 CREATE TABLE paths
 (
  path varchar(1024) UNIQUE,  -- index
  pathid integer PRIMARY KEY AUTOINCREMENT
 );
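
For illustration, the following sketch creates an empty database with the v1
schema above using the stdlib sqlite3 module. It is not the real
implementation; "tahoe backup" obtains its database through
backupdb.get_backupdb(), which also handles reopening and version-checking an
existing file:

 # Sketch only: create an empty backupdb with the v1 schema described above.
 import sqlite3

 SCHEMA_V1 = """
 CREATE TABLE version (version INTEGER);
 CREATE TABLE local_files
 (
  path VARCHAR(1024) PRIMARY KEY,
  size INTEGER,
  mtime NUMBER,
  ctime NUMBER,
  fileid INTEGER
 );
 CREATE TABLE caps
 (
  fileid INTEGER PRIMARY KEY AUTOINCREMENT,
  filecap VARCHAR(256) UNIQUE
 );
 CREATE TABLE last_upload
 (
  fileid INTEGER PRIMARY KEY,
  last_uploaded TIMESTAMP,
  last_checked TIMESTAMP
 );
 """

 def create_backupdb(dbfile):
     conn = sqlite3.connect(dbfile)
     conn.executescript(SCHEMA_V1)
     conn.execute("INSERT INTO version (version) VALUES (1)")
     conn.commit()
     return conn
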
== Operation ==

The upload process starts with a pathname (like ~/.emacs) and wants to end up
with a file-cap (like URI:CHK:...).

The first step is to convert the path to an absolute form
(/home/warner/emacs) and do a lookup in the 'local_files' table. If the path
is not present in this table, the file must be uploaded. The upload process
(sketched in code below) is:

 1. record the file's size, creation time, and modification time
 2. upload the file into the grid, obtaining an immutable file read-cap
 3. add an entry to the 'caps' table, with the read-cap, to get a fileid
 4. add an entry to the 'last_upload' table, with the current time
 5. add an entry to the 'local_files' table, with the fileid, the path,
    and the local file's size/ctime/mtime
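
Here is a minimal sketch of those five steps, assuming a sqlite3 connection
like the one above and a hypothetical upload_file(abspath) helper standing in
for the actual Tahoe upload (which would return the new read-cap):

 # Sketch of the upload path; upload_file() is a hypothetical stand-in for
 # the real Tahoe upload (e.g. a PUT to the node's web API).
 import os, stat, time

 def upload_and_record(conn, abspath, upload_file):
     s = os.stat(abspath)                                  # step 1
     size, mtime, ctime = s[stat.ST_SIZE], s[stat.ST_MTIME], s[stat.ST_CTIME]
     filecap = upload_file(abspath)                        # step 2
     cur = conn.execute("INSERT INTO caps (filecap) VALUES (?)", (filecap,))
     fileid = cur.lastrowid                                # step 3
     now = time.time()
     conn.execute("INSERT INTO last_upload"
                  " (fileid, last_uploaded, last_checked) VALUES (?,?,?)",
                  (fileid, now, now))                      # step 4
     conn.execute("INSERT INTO local_files"
                  " (path, size, mtime, ctime, fileid) VALUES (?,?,?,?,?)",
                  (abspath, size, mtime, ctime, fileid))   # step 5
     conn.commit()
     return filecap
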
If the path *is* present in 'local_files', the easy-to-compute identifying
information is compared: file size and ctime/mtime. If these differ, the file
must be uploaded. The row is removed from the 'local_files' table, and the
upload process above is followed.

If the path is present but the ctime or mtime differs, the file may have
changed. If the size differs, then the file has certainly changed. At this
point, a future version of the "backup" command might hash the file and look
for a match in an as-yet-undefined table, in the hopes that the file has
simply been moved from somewhere else on the disk. This enhancement requires
changes to the Tahoe upload API before it can be significantly more efficient
than simply handing the file to Tahoe and relying upon the normal convergence
to notice the similarity.

If ctime, mtime, or size is different, the client will upload the file, as
above.

If these identifiers are the same, the client will assume that the file is
unchanged (unless the --ignore-timestamps option is provided, in which case
the client always re-uploads the file), and it may be allowed to skip the
upload. For safety, however, we require that the client periodically perform
a filecheck on these probably-already-uploaded files, and re-upload anything
that does not look healthy. The client looks up the fileid in the
'last_upload' table to see how long it has been since the file was last
checked.
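
In the current CLI code this decision is driven by the backupdb module: the
command calls check_file() and inspects the result object (was_uploaded(),
did_upload()). The sketch below shows the rough shape of that flow;
maybe_filecheck() and upload_file() are hypothetical stand-ins, not real
APIs:

 # Rough sketch of how the CLI consults the backupdb; the real logic lives
 # in tahoe_backup.py (check_backupdb/upload) and the backupdb module.
 def backup_one_file(bdb, abspath, use_timestamps, upload_file, maybe_filecheck):
     r = bdb.check_file(abspath, use_timestamps)
     filecap = r.was_uploaded()          # False, or the previous filecap
     if filecap:
         healthy = maybe_filecheck(r, filecap)  # "random early check"
         if healthy:
             return filecap              # skip the upload, reuse the old cap
     filecap = upload_file(abspath)      # upload (or re-upload) the file
     r.did_upload(filecap)               # record the new cap in the backupdb
     return filecap
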
A "random early check" algorithm should be used, in which a check is
performed with a probability that increases with the age of the previous
results. For example, files that were last checked within a month are not
checked at all, files that were checked 5 weeks ago are re-checked with 25%
probability, files checked 6 weeks ago with 50% probability, and files
checked more than 8 weeks ago are always checked. This reduces the
"thundering herd" of filechecks-on-everything that would otherwise result
when a backup operation is run one month after the original backup. If a
filecheck reveals that the file is not healthy, it is re-uploaded.
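
A sketch of that probability schedule, treating a "month" as four weeks so
that the 5/6/8-week numbers above work out exactly (the real policy may
choose different constants):

 # Sketch: probability of performing a filecheck, given the time of the
 # previous check. Linear ramp from one "month" to two "months" of age.
 import random, time

 MONTH = 28 * 24 * 60 * 60  # four weeks: 5/6/8 weeks give 25%/50%/100%

 def should_check(last_checked, now=None):
     if now is None:
         now = time.time()
     age = now - last_checked
     probability = (age - MONTH) / float(MONTH)
     probability = min(max(probability, 0.0), 1.0)
     return random.random() < probability
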
If the filecheck shows the file is healthy, or if the filecheck was skipped,
the client gets to skip the upload, and uses the previous filecap (from the
'caps' table) to add to the parent directory.

If a new file is uploaded, a new entry is put in the 'caps' and 'last_upload'
tables, and an entry is made in the 'local_files' table to reflect the
mapping from local disk pathname to uploaded filecap. If an old file is
re-uploaded, the 'last_upload' entry is updated with the new timestamps. If
an old file is checked and found healthy, the 'last_upload' entry is updated.

Relying upon timestamps is a compromise between efficiency and safety: a file
which is modified without changing the timestamp or size will be treated as
unmodified, and the "tahoe backup" command will not copy the new contents
into the grid. The --no-timestamps option can be used to disable this
optimization, forcing every byte of the file to be hashed and encoded.

== DIRECTORY CACHING ==

A future version of the backupdb will also record a secure hash of the most
recent contents of each Tahoe directory that was used in the last backup run.
The directories created by the "tahoe backup" command are all read-only, so
it should be difficult to violate the assumption that these directories are
unmodified since the previous pass. In the future, Tahoe will provide truly
immutable directories, making this assumption even more solid.

In the current implementation, when the backup algorithm is faced with the
decision to either create a new directory or share an old one, it must read
the contents of the old directory to compare it against the desired new
contents. This means that a "null backup" (performing a backup when nothing
has been changed) must still read every Tahoe directory from the previous
backup.

With a directory-caching backupdb, these directory reads will be bypassed,
and the null backup will use minimal network bandwidth: one directory read
and two directory modifications. The Archives/ directory must be read to
locate the latest backup, and must be modified to add a new snapshot, and the
Latest/ directory will be updated to point to that same snapshot.
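
One speculative shape for such a cache entry is a hash over the directory's
sorted contents; this is only a sketch of the idea, not the implemented
design:

 # Speculative sketch of a directory-contents fingerprint for the future
 # cache. newdircontents maps childname -> (type, rocap, metadata), as in
 # tahoe_backup.py; hashlib.sha256 would also work on newer Pythons.
 import sha

 def contents_fingerprint(newdircontents):
     h = sha.new()
     for name in sorted(newdircontents):
         childtype, rocap, metadata = newdircontents[name]
         h.update("%d:%s,%d:%s," % (len(name), name, len(rocap), rocap))
     return h.hexdigest()

 # A "null backup" could then compare contents_fingerprint(newdircontents)
 # against the fingerprint recorded for the previous snapshot's directory,
 # and reuse the old dircap without reading it when they match.
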
docs/proposed/backupdb.txt (deleted file, 188 lines)
@@ -1,188 +0,0 @@
= PRELIMINARY =

This document is a description of a feature which is not yet implemented,
added here to solicit feedback and to describe future plans. This document is
subject to revision or withdrawal at any moment. Until this notice is
removed, consider this entire document to be a figment of your imagination.

= The Tahoe BackupDB =

To speed up backup operations, Tahoe maintains a small database known as the
"backupdb". It is used to avoid re-uploading files which have already been
uploaded recently.

This database lives in ~/.tahoe/private/backupdb.sqlite, and is a single-file
SQLite database. It is used by the "tahoe backup" command, and by the
"tahoe cp" command when the --use-backupdb option is included.

The purpose of this database is specifically to manage the file-to-cap
translation (the "upload" step). It does not address directory updates.

The overall goal of optimizing backup is to reduce the work required when the
source disk has not changed since the last backup. In the ideal case, running
"tahoe backup" twice in a row, with no intervening changes to the disk, will
not require any network traffic.

This database is optional. If it is deleted, the worst effect is that a
subsequent backup operation may use more effort (network bandwidth, CPU
cycles, and disk IO) than it would have without the backupdb.

== Schema ==

The database contains the following tables:

 CREATE TABLE version
 (
  version integer  # contains one row, set to 0
 );

 CREATE TABLE last_upload
 (
  path varchar(1024),  # index, this is os.path.abspath(fn)
  size integer,        # os.stat(fn)[stat.ST_SIZE]
  mtime number,        # os.stat(fn)[stat.ST_MTIME]
  fileid integer
 );

 CREATE TABLE caps
 (
  fileid integer PRIMARY KEY AUTOINCREMENT,
  filecap varchar(256),  # URI:CHK:...
  last_uploaded timestamp,
  last_checked timestamp
 );

 CREATE TABLE keys_to_files
 (
  readkey varchar(256) PRIMARY KEY,  # index, AES key portion of filecap
  fileid integer
 );

Notes: if we extend the backupdb to assist with directory maintenance (see
below), we may need paths in multiple places, so it would make sense to
create a table for them, and change the last_upload table to refer to a
pathid instead of an absolute path:

 CREATE TABLE paths
 (
  path varchar(1024),  # index
  pathid integer PRIMARY KEY AUTOINCREMENT
 );

== Operation ==

The upload process starts with a pathname (like ~/.emacs) and wants to end up
with a file-cap (like URI:CHK:...).

The first step is to convert the path to an absolute form
(/home/warner/emacs) and do a lookup in the last_upload table. If the path is
not present in this table, the file must be uploaded. The upload process is:

 1. record the file's size and modification time
 2. upload the file into the grid, obtaining an immutable file read-cap
 3. add an entry to the 'caps' table, with the read-cap, and the current time
 4. extract the read-key from the read-cap, add an entry to 'keys_to_files'
 5. add an entry to 'last_upload'

If the path *is* present in 'last_upload', the easy-to-compute identifying
information is compared: file size and modification time. If these differ,
the file must be uploaded. The row is removed from the last_upload table, and
the upload process above is followed.

If the path is present but the mtime differs, the file may have changed. If
the size differs, then the file has certainly changed. The client will
compute the CHK read-key for the file by hashing its contents, using exactly
the same algorithm as the node does when it uploads a file (including
~/.tahoe/private/convergence). It then checks the 'keys_to_files' table to
see if this file has been uploaded before: perhaps the file was moved from
elsewhere on the disk. If no match is found, the file must be uploaded, so
the upload process above is followed.

If the read-key *is* found in the 'keys_to_files' table, then the file has
been uploaded before, but we should consider performing a file check / verify
operation to make sure we can skip a new upload. The fileid is used to
retrieve the entry from the 'caps' table, and the last_checked timestamp is
examined. If this timestamp is too old, a filecheck operation should be
performed, and the file repaired if the results are not satisfactory. A
"random early check" algorithm should be used, in which a check is performed
with a probability that increases with the age of the previous results. E.g.
files that were last checked within a month are not checked, files that were
checked 5 weeks ago are re-checked with 25% probability, 6 weeks with 50%,
more than 8 weeks are always checked. This reduces the "thundering herd" of
filechecks-on-everything that would otherwise result when a backup operation
is run one month after the original backup. The readkey can be submitted to
the upload operation, to remove a duplicate hashing pass through the file and
reduce the disk IO. In a future version of the storage server protocol, this
could also improve the "streamingness" of the upload process.

If the file's size and mtime match, the file is considered to be unmodified,
and the last_checked timestamp from the 'caps' table is examined as above
(possibly resulting in a filecheck or repair). The --no-timestamps option
disables this check: this removes the danger of false-positives (i.e. not
uploading a new file, because it appeared to be the same as a previously
uploaded one), but increases the amount of disk IO that must be performed
(every byte of every file must be hashed to compute the readkey).

This algorithm is summarized in the following pseudocode:

{{{
def backup(path):
    abspath = os.path.abspath(path)
    result = check_for_upload(abspath)
    now = time.time()
    if result == MUST_UPLOAD:
        filecap = upload(abspath, key=result.readkey)
        fileid = db("INSERT INTO caps (filecap, last_uploaded, last_checked)",
                    (filecap, now, now))
        db("INSERT INTO keys_to_files", (result.readkey, fileid))
        db("INSERT INTO last_upload", (abspath,current_size,current_mtime,fileid))
    if result in (MOVED, ALREADY_UPLOADED):
        age = now - result.last_checked
        probability = (age - 1*MONTH) / (1*MONTH)
        probability = min(max(probability, 0.0), 1.0)
        if random.random() < probability:
            do_filecheck(result.filecap)
    if result == MOVED:
        db("INSERT INTO last_upload",
           (abspath, current_size, current_mtime, result.fileid))


def check_for_upload(abspath):
    row = db("SELECT (size,mtime,fileid) FROM last_upload WHERE path == %s"
             % abspath)
    if not row:
        return check_moved(abspath)
    current_size = os.stat(abspath)[stat.ST_SIZE]
    current_mtime = os.stat(abspath)[stat.ST_MTIME]
    (last_size, last_mtime, last_fileid) = row
    if file_changed(current_size, last_size, current_mtime, last_mtime):
        db("DELETE FROM last_upload WHERE fileid=%s" % last_fileid)
        return check_moved(abspath)
    (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" +
                                 " WHERE fileid == %s" % last_fileid)
    return ALREADY_UPLOADED(filecap=filecap, last_checked=last_checked)

def file_changed(current_size, last_size, current_mtime, last_mtime):
    if last_size != current_size:
        return True
    if NO_TIMESTAMPS:
        return True
    if last_mtime != current_mtime:
        return True
    return False

def check_moved(abspath):
    readkey = hash_with_convergence(abspath)
    fileid = db("SELECT (fileid) FROM keys_to_files WHERE readkey == %s" % readkey)
    if not fileid:
        return MUST_UPLOAD(readkey=readkey)
    (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" +
                                 " WHERE fileid == %s" % fileid)
    return MOVED(fileid=fileid, filecap=filecap, last_checked=last_checked)

def do_filecheck(filecap):
    health = check(filecap)
    if health < DESIRED:
        repair(filecap)
}}}

src/allmydata/scripts/tahoe_backup.py
@@ -50,24 +50,6 @@ def parse_old_timestamp(s, options):
    print >>options.stderr, "unable to parse old timestamp '%s', ignoring" % s
    return None

def readdir(dircap, options):
    # returns a dict of (childname: (type, readcap, metadata)), or None if the
    # dircap didn't point to a directory
    url = options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap)
    resp = do_http("GET", url)
    if resp.status != 200:
        raiseHTTPError("Error during directory GET", resp)
    jd = simplejson.load(resp)
    ntype, ndata = jd
    if ntype != "dirnode":
        return None
    contents = {}
    for (childname, (childtype, childdata)) in ndata["children"].items():
        contents[childname] = (childtype,
                               str(childdata["ro_uri"]),
                               childdata["metadata"])
    return contents

def get_local_metadata(path):
    metadata = {}
@@ -131,100 +113,120 @@ def directory_is_changed(a, b):
            return True
    return False

def backup(options):
    nodeurl = options['node-url']
    from_dir = options.from_dir
    to_dir = options.to_dir
    if options['quiet']:
        verbosity = 0
    else:
        verbosity = 2
    stdin = options.stdin
    stdout = options.stdout
    stderr = options.stderr

class BackerUpper:
    def __init__(self, options):
        self.options = options
        self.files_uploaded = 0
        self.files_reused = 0
        self.files_checked = 0
        self.directories_read = 0
        self.directories_created = 0
        self.directories_reused = 0
        self.directories_checked = 0

    use_backupdb = not options["no-backupdb"]
    options.backupdb = None
    if use_backupdb:
        bdbfile = os.path.join(options["node-directory"],
                               "private", "backupdb.sqlite")
        bdbfile = os.path.abspath(bdbfile)
        options.backupdb = backupdb.get_backupdb(bdbfile)

    def run(self):
        options = self.options
        nodeurl = options['node-url']
        from_dir = options.from_dir
        to_dir = options.to_dir
        self.verbosity = 1
        if options['quiet']:
            self.verbosity = 0
        if options['verbose']:
            self.verbosity = 2
        stdin = options.stdin
        stdout = options.stdout
        stderr = options.stderr

    rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
    to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
    if path:
        to_url += escape_path(path)
    if not to_url.endswith("/"):
        to_url += "/"

        self.backupdb = None
        use_backupdb = not options["no-backupdb"]
        if use_backupdb:
            bdbfile = os.path.join(options["node-directory"],
                                   "private", "backupdb.sqlite")
            bdbfile = os.path.abspath(bdbfile)
            self.backupdb = backupdb.get_backupdb(bdbfile)

    archives_url = to_url + "Archives/"
    latest_url = to_url + "Latest"

        rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
        to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
        if path:
            to_url += escape_path(path)
        if not to_url.endswith("/"):
            to_url += "/"

    # first step: make sure the target directory exists, as well as the
    # Archives/ subdirectory.
    resp = do_http("GET", archives_url + "?t=json")
    if resp.status == 404:
        resp = do_http("POST", archives_url + "?t=mkdir")
        if resp.status != 200:
            print >>stderr, "Unable to create target directory: %s %s %s" % \
                  (resp.status, resp.reason, resp.read())
            return 1
        archives_dir = {}
    else:
        jdata = simplejson.load(resp)
        (otype, attrs) = jdata
        archives_dir = attrs["children"]

        archives_url = to_url + "Archives/"
        latest_url = to_url + "Latest"

    # second step: locate the most recent backup in TODIR/Archives/*
    latest_backup_time = 0
    latest_backup_name = None
    latest_backup_dircap = None

        # first step: make sure the target directory exists, as well as the
        # Archives/ subdirectory.
        resp = do_http("GET", archives_url + "?t=json")
        if resp.status == 404:
            resp = do_http("POST", archives_url + "?t=mkdir")
            if resp.status != 200:
                print >>stderr, "Unable to create target directory: %s %s %s" % \
                      (resp.status, resp.reason, resp.read())
                return 1
            archives_dir = {}
        else:
            jdata = simplejson.load(resp)
            (otype, attrs) = jdata
            archives_dir = attrs["children"]

    # we have various time formats. The allmydata.com windows backup tool
    # appears to create things like "2008-11-16 10.34 PM". This script
    # creates things like "2009-11-16--17.34Z".
    for archive_name in archives_dir.keys():
        if archives_dir[archive_name][0] != "dirnode":
            continue
        when = parse_old_timestamp(archive_name, options)
        if when is not None:
            if when > latest_backup_time:
                latest_backup_time = when
                latest_backup_name = archive_name
                latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"])

        # second step: locate the most recent backup in TODIR/Archives/*
        latest_backup_time = 0
        latest_backup_name = None
        latest_backup_dircap = None

    # third step: process the tree
    new_backup_dircap = Node().process(options.from_dir,
                                       latest_backup_dircap,
                                       options)
    print >>stdout, "new backup done"

        # we have various time formats. The allmydata.com windows backup tool
        # appears to create things like "2008-11-16 10.34 PM". This script
        # creates things like "2009-11-16--17.34Z".
        for archive_name in archives_dir.keys():
            if archives_dir[archive_name][0] != "dirnode":
                continue
            when = parse_old_timestamp(archive_name, options)
            if when is not None:
                if when > latest_backup_time:
                    latest_backup_time = when
                    latest_backup_name = archive_name
                    latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"])

    # fourth: attach the new backup to the list
    new_readonly_backup_dircap = readonly(new_backup_dircap)
    now = time_format.iso_utc(int(time.time()), sep="_") + "Z"

        # third step: process the tree
        new_backup_dircap = self.process(options.from_dir, latest_backup_dircap)

    put_child(archives_url, now, new_readonly_backup_dircap)
    put_child(to_url, "Latest", new_readonly_backup_dircap)

        # fourth: attach the new backup to the list
        new_readonly_backup_dircap = readonly(new_backup_dircap)
        now = time_format.iso_utc(int(time.time()), sep="_") + "Z"

    print >>stdout, "backup done"
    # done!
    return 0

        put_child(archives_url, now, new_readonly_backup_dircap)
        put_child(to_url, "Latest", new_readonly_backup_dircap)

        if self.verbosity >= 1:
            print >>stdout, (" %d files uploaded (%d reused), "
                             "%d directories created (%d reused)"
                             % (self.files_uploaded,
                                self.files_reused,
                                self.directories_created,
                                self.directories_reused))
        if self.verbosity >= 2:
            print >>stdout, (" %d files checked, %d directories checked, "
                             "%d directories read"
                             % (self.files_checked,
                                self.directories_checked,
                                self.directories_read))
        print >>stdout, " backup done"
        # done!
        return 0

class Node:
    def verboseprint(self, msg):
        if self.options["verbose"]:
        if self.verbosity >= 2:
            print >>self.options.stdout, msg

    def process(self, localpath, olddircap, options):
    def process(self, localpath, olddircap):
        # returns newdircap
        self.options = options

        self.verboseprint("processing %s, olddircap %s" % (localpath, olddircap))
        olddircontents = {}
        if olddircap:
            olddircontents = readdir(olddircap, options)
            olddircontents = self.readdir(olddircap)

        newdircontents = {} # childname -> (type, rocap, metadata)
        for child in os.listdir(localpath):
@@ -234,7 +236,8 @@ class Node:
                oldchildcap = None
                if olddircontents is not None and child in olddircontents:
                    oldchildcap = olddircontents[child][1]
                newchilddircap = self.recurse(childpath, oldchildcap)
                # recurse on the child directory
                newchilddircap = self.process(childpath, oldchildcap)
                newdircontents[child] = ("dirnode", newchilddircap, metadata)
            elif os.path.isfile(childpath):
                newfilecap, metadata = self.upload(childpath)
@@ -248,25 +251,21 @@ class Node:
            ):
            self.verboseprint(" %s not changed, re-using old directory" % localpath)
            # yay! they're identical!
            self.directories_reused += 1
            return olddircap
        else:
            self.verboseprint(" %s changed, making new directory" % localpath)
            # something changed, or there was no previous directory, so we
            # must make a new directory
            newdircap = mkdir(newdircontents, options)
            newdircap = mkdir(newdircontents, self.options)
            self.directories_created += 1
            return readonly(newdircap)

    def recurse(self, localpath, olddircap):
        n = self.__class__()
        return n.process(localpath, olddircap, self.options)

    def check_backupdb(self, childpath):
        if not self.options.backupdb:
        if not self.backupdb:
            return True, None
        use_timestamps = not self.options["ignore-timestamps"]
        bdb = self.options.backupdb
        r = bdb.check_file(childpath, use_timestamps)
        r = self.backupdb.check_file(childpath, use_timestamps)

        if not r.was_uploaded():
            return True, r
@@ -281,6 +280,7 @@ class Node:
        self.verboseprint("checking %s" % filecap)
        nodeurl = self.options['node-url']
        checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(filecap)
        self.files_checked += 1
        resp = do_http("POST", checkurl)
        if resp.status != 200:
            # can't check, so we must assume it's bad
@@ -295,6 +295,25 @@ class Node:
        r.did_check_healthy(cr)
        return False, r

    def readdir(self, dircap):
        # returns a dict of (childname: (type, readcap, metadata)), or None
        # if the dircap didn't point to a directory
        self.directories_read += 1
        url = self.options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap)
        resp = do_http("GET", url)
        if resp.status != 200:
            raiseHTTPError("Error during directory GET", resp)
        jd = simplejson.load(resp)
        ntype, ndata = jd
        if ntype != "dirnode":
            return None
        contents = {}
        for (childname, (childtype, childdata)) in ndata["children"].items():
            contents[childname] = (childtype,
                                   str(childdata["ro_uri"]),
                                   childdata["metadata"])
        return contents

    def upload(self, childpath):
        #self.verboseprint("uploading %s.." % childpath)
        metadata = get_local_metadata(childpath)
@@ -316,9 +335,14 @@ class Node:
            if bdb_results:
                bdb_results.did_upload(filecap)

            self.files_uploaded += 1
            return filecap, metadata

        else:
            self.verboseprint("skipping %s.." % childpath)
            self.files_reused += 1
            return bdb_results.was_uploaded(), metadata

def backup(options):
    bu = BackerUpper(options)
    return bu.run()

src/allmydata/test/test_cli.py
@@ -5,6 +5,7 @@ from twisted.trial import unittest
from cStringIO import StringIO
import urllib
import time
import re

from allmydata.util import fileutil, hashutil
from allmydata import uri
@@ -16,7 +17,7 @@ _hush_pyflakes = [tahoe_ls, tahoe_get, tahoe_put, tahoe_rm, tahoe_cp]

from allmydata.scripts.common import DEFAULT_ALIAS, get_aliases

from allmydata.scripts import cli, debug, runner
from allmydata.scripts import cli, debug, runner, backupdb
from allmydata.test.common import SystemTestMixin
from twisted.internet import threads # CLI tests use deferToThread

@@ -627,9 +628,23 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
        f.write(data)
        f.close()

    def count_output(self, out):
        mo = re.search(r"(\d+) files uploaded \((\d+) reused\), (\d+) directories created \((\d+) reused\)", out)
        return [int(s) for s in mo.groups()]

    def count_output2(self, out):
        mo = re.search(r"(\d+) files checked, (\d+) directories checked, (\d+) directories read", out)
        return [int(s) for s in mo.groups()]

    def test_backup(self):
        self.basedir = os.path.dirname(self.mktemp())

        # is the backupdb available? If so, we test that a second backup does
        # not create new directories.
        hush = StringIO()
        have_bdb = backupdb.get_backupdb(os.path.join(self.basedir, "dbtest"),
                                         hush)

        # create a small local directory with a couple of files
        source = os.path.join(self.basedir, "home")
        fileutil.make_dirs(os.path.join(source, "empty"))
@@ -643,7 +658,15 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
        def _check0((rc, out, err)):
            self.failUnlessEqual(err, "")
            self.failUnlessEqual(rc, 0)
            fu, fr, dc, dr = self.count_output(out)
            # foo.txt, bar.txt, blah.txt
            self.failUnlessEqual(fu, 3)
            self.failUnlessEqual(fr, 0)
            # empty, home, home/parent, home/parent/subdir
            self.failUnlessEqual(dc, 4)
            self.failUnlessEqual(dr, 0)
        d.addCallback(_check0)

        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups"))
        def _check1((rc, out, err)):
            self.failUnlessEqual(err, "")
@@ -678,12 +701,62 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):

        d.addCallback(lambda res: self.do_cli("backup", source, "tahoe:backups"))
        def _check4a((rc, out, err)):
            # second backup should reuse everything, if the backupdb is
            # available
            self.failUnlessEqual(err, "")
            self.failUnlessEqual(rc, 0)
            if have_bdb:
                fu, fr, dc, dr = self.count_output(out)
                # foo.txt, bar.txt, blah.txt
                self.failUnlessEqual(fu, 0)
                self.failUnlessEqual(fr, 3)
                # empty, home, home/parent, home/parent/subdir
                self.failUnlessEqual(dc, 0)
                self.failUnlessEqual(dr, 4)
        d.addCallback(_check4a)

        if have_bdb:
            # sneak into the backupdb, crank back the "last checked"
            # timestamp to force a check on all files
            def _reset_last_checked(res):
                dbfile = os.path.join(self.basedir,
                                      "client0", "private", "backupdb.sqlite")
                self.failUnless(os.path.exists(dbfile), dbfile)
                bdb = backupdb.get_backupdb(dbfile)
                bdb.cursor.execute("UPDATE last_upload SET last_checked=0")
                bdb.connection.commit()

            d.addCallback(_reset_last_checked)

            d.addCallback(lambda res:
                          self.do_cli("backup", "--verbose", source, "tahoe:backups"))
            def _check4b((rc, out, err)):
                # we should check all files, and re-use all of them. None of
                # the directories should have been changed.
                self.failUnlessEqual(err, "")
                self.failUnlessEqual(rc, 0)
                fu, fr, dc, dr = self.count_output(out)
                fchecked, dchecked, dread = self.count_output2(out)
                self.failUnlessEqual(fchecked, 3)
                self.failUnlessEqual(fu, 0)
                self.failUnlessEqual(fr, 3)
                # TODO: backupdb doesn't do dirs yet; when it does, this will
                # change to dchecked=4, and maybe dread=0
                self.failUnlessEqual(dchecked, 0)
                self.failUnlessEqual(dread, 4)
                self.failUnlessEqual(dc, 0)
                self.failUnlessEqual(dr, 4)
            d.addCallback(_check4b)

        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
        def _check5((rc, out, err)):
            self.failUnlessEqual(err, "")
            self.failUnlessEqual(rc, 0)
            self.new_archives = out.split()
            self.failUnlessEqual(len(self.new_archives), 2)
            self.failUnlessEqual(len(self.new_archives), 3)
            # the original backup should still be the oldest (i.e. sorts
            # alphabetically towards the beginning)
            self.failUnlessEqual(sorted(self.new_archives)[0],
                                 self.old_archives[0])
        d.addCallback(_check5)
@@ -701,12 +774,27 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
            self.writeto("empty", "imagine nothing being here")
            return self.do_cli("backup", source, "tahoe:backups")
        d.addCallback(_modify)
        def _check5a((rc, out, err)):
            # second backup should reuse bar.txt (if backupdb is available),
            # and upload the rest. None of the directories can be reused.
            self.failUnlessEqual(err, "")
            self.failUnlessEqual(rc, 0)
            if have_bdb:
                fu, fr, dc, dr = self.count_output(out)
                # new foo.txt, surprise file, subfile, empty
                self.failUnlessEqual(fu, 4)
                # old bar.txt
                self.failUnlessEqual(fr, 1)
                # home, parent, subdir, blah.txt, surprisedir
                self.failUnlessEqual(dc, 5)
                self.failUnlessEqual(dr, 0)
        d.addCallback(_check5a)
        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
        def _check6((rc, out, err)):
            self.failUnlessEqual(err, "")
            self.failUnlessEqual(rc, 0)
            self.new_archives = out.split()
            self.failUnlessEqual(len(self.new_archives), 3)
            self.failUnlessEqual(len(self.new_archives), 4)
            self.failUnlessEqual(sorted(self.new_archives)[0],
                                 self.old_archives[0])
        d.addCallback(_check6)
@@ -724,5 +812,19 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
            self.failUnlessEqual(out, "foo")
        d.addCallback(_check8)

        d.addCallback(lambda res:
                      self.do_cli("backup", "--no-backupdb", source, "tahoe:backups"))
        def _check9((rc, out, err)):
            # --no-backupdb means re-upload everything. We still get to
            # re-use the directories, since nothing changed.
            self.failUnlessEqual(err, "")
            self.failUnlessEqual(rc, 0)
            fu, fr, dc, dr = self.count_output(out)
            self.failUnlessEqual(fu, 5)
            self.failUnlessEqual(fr, 0)
            self.failUnlessEqual(dc, 0)
            self.failUnlessEqual(dr, 5)
        d.addCallback(_check9)

        return d