first pass refactoring

Now collect the backup work up front instead of mixing it with processing.
This commit is contained in:
Jean-Paul Calderone 2018-03-13 21:04:14 -04:00
parent 3bb4c979c8
commit c55d2823ae
2 changed files with 355 additions and 102 deletions

View File

@ -56,20 +56,9 @@ def put_child(dirurl, childname, childcap):
if resp.status not in (200, 201): if resp.status not in (200, 201):
raise HTTPError("Error during put_child", resp) raise HTTPError("Error during put_child", resp)
class BackupProcessingError(Exception):
pass
class BackerUpper: class BackerUpper:
def __init__(self, options): def __init__(self, options):
self.options = options self.options = options
self.files_uploaded = 0
self.files_reused = 0
self.files_checked = 0
self.files_skipped = 0
self.directories_created = 0
self.directories_reused = 0
self.directories_checked = 0
self.directories_skipped = 0
def run(self): def run(self):
options = self.options options = self.options
@ -83,7 +72,6 @@ class BackerUpper:
stderr = options.stderr stderr = options.stderr
start_timestamp = datetime.datetime.now() start_timestamp = datetime.datetime.now()
self.backupdb = None
bdbfile = os.path.join(options["node-directory"], bdbfile = os.path.join(options["node-directory"],
"private", "backupdb.sqlite") "private", "backupdb.sqlite")
bdbfile = abspath_expanduser_unicode(bdbfile) bdbfile = abspath_expanduser_unicode(bdbfile)
@ -94,7 +82,7 @@ class BackerUpper:
try: try:
rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS) rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
except UnknownAliasError, e: except UnknownAliasError as e:
e.display(stderr) e.display(stderr)
return 1 return 1
to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap) to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
@ -115,36 +103,29 @@ class BackerUpper:
return 1 return 1
# second step: process the tree # second step: process the tree
new_backup_dircap = self.process(options.from_dir) targets = list(collect_backup_targets(
options.from_dir,
listdir_unicode,
self.options.filter_listdir,
))
completed = run_backup(
warn=self.warn,
upload_file=self.upload,
upload_directory=self.upload_directory,
targets=targets,
start_timestamp=start_timestamp,
)
new_backup_dircap = completed.dircap
# third: attach the new backup to the list # third: attach the new backup to the list
now = time_format.iso_utc(int(time.time()), sep="_") + "Z" now = time_format.iso_utc(int(time.time()), sep="_") + "Z"
put_child(archives_url, now, new_backup_dircap) put_child(archives_url, now, new_backup_dircap)
put_child(to_url, "Latest", new_backup_dircap) put_child(to_url, "Latest", new_backup_dircap)
end_timestamp = datetime.datetime.now() print >>stdout, completed.report(self.verbosity)
# calc elapsed time, omitting microseconds
elapsed_time = str(end_timestamp - start_timestamp).split('.')[0]
if self.verbosity >= 1:
print >>stdout, (" %d files uploaded (%d reused), "
"%d files skipped, "
"%d directories created (%d reused), "
"%d directories skipped"
% (self.files_uploaded,
self.files_reused,
self.files_skipped,
self.directories_created,
self.directories_reused,
self.directories_skipped))
if self.verbosity >= 2:
print >>stdout, (" %d files checked, %d directories checked"
% (self.files_checked,
self.directories_checked))
print >>stdout, " backup done, elapsed time: %s" % elapsed_time
# The command exits with code 2 if files or directories were skipped # The command exits with code 2 if files or directories were skipped
if self.files_skipped or self.directories_skipped: if completed.any_skips():
return 2 return 2
# done! # done!
@ -159,66 +140,19 @@ class BackerUpper:
precondition(isinstance(msg, str), msg) precondition(isinstance(msg, str), msg)
print >>self.options.stderr, msg print >>self.options.stderr, msg
def process(self, localpath): def upload_directory(self, path, compare_contents, create_contents):
precondition_abspath(localpath)
# returns newdircap
quoted_path = quote_local_unicode_path(localpath)
self.verboseprint("processing %s" % (quoted_path,))
create_contents = {} # childname -> (type, rocap, metadata)
compare_contents = {} # childname -> rocap
try:
children = listdir_unicode(localpath)
except EnvironmentError:
self.directories_skipped += 1
self.warn("WARNING: permission denied on directory %s" % (quoted_path,))
children = []
except FilenameEncodingError:
self.directories_skipped += 1
self.warn("WARNING: could not list directory %s due to a filename encoding error" % (quoted_path,))
children = []
for child in self.options.filter_listdir(children):
assert isinstance(child, unicode), child
childpath = os.path.join(localpath, child)
# note: symlinks to directories are both islink() and isdir()
if os.path.isdir(childpath) and not os.path.islink(childpath):
metadata = get_local_metadata(childpath)
# recurse on the child directory
childcap = self.process(childpath)
assert isinstance(childcap, str)
create_contents[child] = ("dirnode", childcap, metadata)
compare_contents[child] = childcap
elif os.path.isfile(childpath) and not os.path.islink(childpath):
try:
childcap, metadata = self.upload(childpath)
assert isinstance(childcap, str)
create_contents[child] = ("filenode", childcap, metadata)
compare_contents[child] = childcap
except EnvironmentError:
self.files_skipped += 1
self.warn("WARNING: permission denied on file %s" % quote_local_unicode_path(childpath))
else:
self.files_skipped += 1
if os.path.islink(childpath):
self.warn("WARNING: cannot backup symlink %s" % quote_local_unicode_path(childpath))
else:
self.warn("WARNING: cannot backup special file %s" % quote_local_unicode_path(childpath))
must_create, r = self.check_backupdb_directory(compare_contents) must_create, r = self.check_backupdb_directory(compare_contents)
if must_create: if must_create:
self.verboseprint(" creating directory for %s" % quote_local_unicode_path(localpath)) self.verboseprint(" creating directory for %s" % quote_local_unicode_path(path))
newdircap = mkdir(create_contents, self.options) newdircap = mkdir(create_contents, self.options)
assert isinstance(newdircap, str) assert isinstance(newdircap, str)
if r: if r:
r.did_create(newdircap) r.did_create(newdircap)
self.directories_created += 1 return True, newdircap
return newdircap
else: else:
self.verboseprint(" re-using old directory for %s" % quote_local_unicode_path(localpath)) self.verboseprint(" re-using old directory for %s" % quote_local_unicode_path(path))
self.directories_reused += 1 return False, r.was_created()
return r.was_created()
def check_backupdb_file(self, childpath): def check_backupdb_file(self, childpath):
if not self.backupdb: if not self.backupdb:
@ -239,7 +173,6 @@ class BackerUpper:
self.verboseprint("checking %s" % quote_output(filecap)) self.verboseprint("checking %s" % quote_output(filecap))
nodeurl = self.options['node-url'] nodeurl = self.options['node-url']
checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(filecap) checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(filecap)
self.files_checked += 1
resp = do_http("POST", checkurl) resp = do_http("POST", checkurl)
if resp.status != 200: if resp.status != 200:
# can't check, so we must assume it's bad # can't check, so we must assume it's bad
@ -272,7 +205,6 @@ class BackerUpper:
self.verboseprint("checking %s" % quote_output(dircap)) self.verboseprint("checking %s" % quote_output(dircap))
nodeurl = self.options['node-url'] nodeurl = self.options['node-url']
checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(dircap) checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(dircap)
self.directories_checked += 1
resp = do_http("POST", checkurl) resp = do_http("POST", checkurl)
if resp.status != 200: if resp.status != 200:
# can't check, so we must assume it's bad # can't check, so we must assume it's bad
@ -313,14 +245,326 @@ class BackerUpper:
if bdb_results: if bdb_results:
bdb_results.did_upload(filecap) bdb_results.did_upload(filecap)
self.files_uploaded += 1 return True, filecap, metadata
return filecap, metadata
else: else:
self.verboseprint("skipping %s.." % quote_local_unicode_path(childpath)) self.verboseprint("skipping %s.." % quote_local_unicode_path(childpath))
self.files_reused += 1 return False, bdb_results.was_uploaded(), metadata
return bdb_results.was_uploaded(), metadata
def backup(options): def backup(options):
bu = BackerUpper(options) bu = BackerUpper(options)
return bu.run() return bu.run()
def collect_backup_targets(root, listdir, filter_children):
    """
    Yield BackupTargets in a suitable order for processing (deepest targets
    before their parents).

    :param root: path of the directory to walk.
    :param listdir: callable taking a directory path and returning its child
        names.  An EnvironmentError from it is reported as a
        PermissionDeniedTarget, a FilenameEncodingError as a
        FilenameUndecodableTarget.
    :param filter_children: callable taking the listed names and returning
        the names that should actually be considered.

    :return: a generator of target objects; the target for ``root`` itself
        comes last.
    """
    try:
        children = listdir(root)
    except EnvironmentError:
        yield PermissionDeniedTarget(root, isdir=True)
    except FilenameEncodingError:
        yield FilenameUndecodableTarget(root, isdir=True)
    else:
        for child in filter_children(children):
            assert isinstance(child, unicode), child
            childpath = os.path.join(root, child)
            # islink() is tested first because a symlink to a directory is
            # both islink() and isdir(); links are never followed.
            if os.path.islink(childpath):
                yield LinkTarget(childpath, isdir=False)
            elif os.path.isdir(childpath):
                child_targets = collect_backup_targets(
                    childpath,
                    listdir,
                    filter_children,
                )
                for child_target in child_targets:
                    yield child_target
            elif os.path.isfile(childpath):
                yield FileTarget(childpath)
            else:
                # The error targets take an explicit isdir flag; a special
                # file is counted as a skipped file, not a directory.
                yield SpecialTarget(childpath, isdir=False)
    # NOTE(review): the directory itself is yielded even when listing it
    # failed, so an unlistable directory is still backed up as an empty one
    # (the pre-refactoring code did the same) -- confirm against the tests.
    yield DirectoryTarget(root)
def run_backup(
        warn,
        upload_file,
        upload_directory,
        targets,
        start_timestamp,
):
    """
    Back up every target in order and return the finished-backup summary.

    :param warn: callable handed to the BackupProgress for warnings.
    :param upload_file: callable passed through to each target's backup().
    :param upload_directory: callable passed through to each target's
        backup().
    :param targets: iterable of target objects, each offering
        ``backup(progress, upload_file, upload_directory)``.
    :param start_timestamp: when the backup started; handed to the
        BackupProgress.

    :return: whatever ``backup_finished()`` of the final progress returns.
    """
    state = BackupProgress(warn, start_timestamp)
    for item in targets:
        # Every target hands back the progress to carry on with.  Ideally
        # progress objects would be immutable and each backup step would
        # produce a fresh one; today BackupProgress is mutable and is simply
        # updated in place.
        state = item.backup(state, upload_file, upload_directory)
        state.report()
    return state.backup_finished()
class FileTarget(object):
    """
    A regular file that should be uploaded as part of the backup.
    """

    def __init__(self, path):
        self._path = path

    def __repr__(self):
        return "<File {}>".format(self._path)

    def backup(self, progress, upload_file, upload_directory):
        """
        Upload this file and record the outcome on ``progress``.

        ``upload_file(path)`` must return ``(created, cap, metadata)``.  An
        EnvironmentError from it turns this target into a permission-denied
        file instead.

        :return: the progress object returned by the recording call.
        """
        path = self._path
        try:
            created, childcap, metadata = upload_file(path)
        except EnvironmentError:
            denied = PermissionDeniedTarget(path, isdir=False)
            return denied.backup(progress, upload_file, upload_directory)

        assert isinstance(childcap, str)
        record = progress.created_file if created else progress.reused_file
        return record(path, childcap, metadata)
class DirectoryTarget(object):
    """
    A directory whose children have already been processed and which should
    now be created (or re-used) itself.
    """

    def __init__(self, path):
        self._path = path

    def __repr__(self):
        return "<Directory {}>".format(self._path)

    def backup(self, progress, upload_file, upload_directory):
        """
        Gather this directory's recorded children from ``progress``, hand
        them to ``upload_directory`` and record the result.

        ``upload_directory(path, compare, create)`` must return
        ``(did_create, dircap)``.

        :return: the progress object returned by the recording call.
        """
        path = self._path
        metadata = get_local_metadata(path)
        progress, to_create, to_compare = progress.consume_directory(path)
        did_create, dircap = upload_directory(path, to_compare, to_create)
        if did_create:
            record = progress.created_directory
        else:
            record = progress.reused_directory
        return record(path, dircap, metadata)
class _ErrorTarget(object):
    """
    Base class for targets that cannot be backed up and are only reported.

    :ivar _path: the local path as given.
    :ivar _quoted_path: ``quote_local_unicode_path(path)``, used in warnings.
    :ivar _isdir: whether the skip is counted against directories (True) or
        files (False).
    """

    def __init__(self, path, isdir=False):
        # isdir defaults to False so that a target built from just a path
        # (as is done for special files) is counted as a skipped file rather
        # than failing with a TypeError.
        self._path = path
        self._quoted_path = quote_local_unicode_path(path)
        self._isdir = isdir
class PermissionDeniedTarget(_ErrorTarget):
    """
    A file or directory that could not be read.
    """

    def backup(self, progress, upload_file, upload_directory):
        """
        Record a permission-denied skip; the upload callables are unused.
        """
        is_directory, shown_path = self._isdir, self._quoted_path
        return progress.permission_denied(is_directory, shown_path)
class FilenameUndecodableTarget(_ErrorTarget):
    """
    A path that could not be listed because of a filename encoding error.
    """

    def backup(self, progress, upload_file, upload_directory):
        """
        Record a decoding-failure skip; the upload callables are unused.
        """
        is_directory, shown_path = self._isdir, self._quoted_path
        return progress.decoding_failed(is_directory, shown_path)
class LinkTarget(_ErrorTarget):
    """
    A symlink, which the backup does not support.
    """

    def backup(self, progress, upload_file, upload_directory):
        """
        Record an unsupported-filetype skip labelled "symlink"; the upload
        callables are unused.
        """
        is_directory, shown_path = self._isdir, self._quoted_path
        return progress.unsupported_filetype(is_directory, shown_path, "symlink")
class SpecialTarget(_ErrorTarget):
    """
    Something that is neither a symlink, a directory nor a regular file
    and therefore cannot be backed up.
    """

    def backup(self, progress, upload_file, upload_directory):
        """
        Record an unsupported-filetype skip; the upload callables are unused.

        :return: the progress object returned by ``unsupported_filetype``.
        """
        # The label is interpolated into "WARNING: cannot backup {filetype}
        # {path}".  "special file" keeps the warning reading "cannot backup
        # special file <path>", as it did before the refactoring, instead of
        # the truncated "cannot backup special <path>".
        return progress.unsupported_filetype(
            self._isdir,
            self._quoted_path,
            "special file",
        )
class BackupComplete(object):
    """
    Immutable-by-convention summary of a finished backup: the counters, the
    start/end timestamps and the cap of the resulting directory (``dircap``).
    """

    def __init__(self,
                 start_timestamp,
                 end_timestamp,
                 files_created,
                 files_reused,
                 files_skipped,
                 directories_created,
                 directories_reused,
                 directories_skipped,
                 dircap,
    ):
        self._start_timestamp = start_timestamp
        self._end_timestamp = end_timestamp

        self._files_created = files_created
        self._files_reused = files_reused
        self._files_skipped = files_skipped

        self._directories_created = directories_created
        self._directories_reused = directories_reused
        self._directories_skipped = directories_skipped

        self.dircap = dircap

    def any_skips(self):
        """
        Return a true value if any file or directory was skipped (the value
        is the first non-zero skip counter, not a strict boolean).
        """
        return self._files_skipped or self._directories_skipped

    def report(self, verbosity):
        """
        Build the human-readable summary, one line per entry joined by
        newlines.

        :param verbosity: at 1 and above the created/reused/skipped counts
            are included; at 2 and above a "checked" line is added as well.
            The elapsed-time line is always present.
        """
        lines = []

        if verbosity >= 1:
            counts_template = (
                " %d files uploaded (%d reused),"
                " %d files skipped,"
                " %d directories created (%d reused),"
                " %d directories skipped"
            )
            counts = (
                self._files_created,
                self._files_reused,
                self._files_skipped,
                self._directories_created,
                self._directories_reused,
                self._directories_skipped,
            )
            lines.append(counts_template % counts)

        if verbosity >= 2:
            # The "checked" figures are derived here as created + reused.
            files_checked = self._files_created + self._files_reused
            dirs_checked = (
                self._directories_created + self._directories_reused
            )
            lines.append(
                " %d files checked, %d directories checked"
                % (files_checked, dirs_checked)
            )

        # Elapsed time with the microseconds (everything after the first
        # '.') dropped.
        duration = self._end_timestamp - self._start_timestamp
        elapsed_time = str(duration).partition('.')[0]
        lines.append(" backup done, elapsed time: %s" % (elapsed_time,))

        return "\n".join(lines)
class BackupProgress(object):
    """
    Running state of a backup: counters, the cap of the most recently
    handled directory, and the per-path results needed to build parent
    directories.

    Every recording method returns the progress to continue with.  It would
    be nicer if this were an immutable structure whose methods produced a
    slightly different copy; for now they all mutate and return ``self``.
    """

    def __init__(self, warn, start_timestamp):
        self._warn = warn
        self._start_timestamp = start_timestamp

        self._files_created = 0
        self._files_reused = 0
        self._files_skipped = 0

        self._directories_created = 0
        self._directories_reused = 0
        self._directories_skipped = 0

        self.last_dircap = None

        # path -> (nodetype, cap, metadata)
        self._create_contents = {}
        # path -> cap
        self._compare_contents = {}

    def report(self):
        """
        Hook called after each target is processed; currently does nothing.
        """
        pass

    def backup_finished(self):
        """
        Stamp the end time and package the counters and the last directory
        cap into a BackupComplete.
        """
        return BackupComplete(
            start_timestamp=self._start_timestamp,
            end_timestamp=datetime.datetime.now(),
            files_created=self._files_created,
            files_reused=self._files_reused,
            files_skipped=self._files_skipped,
            directories_created=self._directories_created,
            directories_reused=self._directories_reused,
            directories_skipped=self._directories_skipped,
            dircap=self.last_dircap,
        )

    @staticmethod
    def _direct_children(contents, dirpath):
        # Select the entries whose parent directory is exactly dirpath and
        # key them by their basename.
        children = {}
        for path, value in contents.iteritems():
            if os.path.dirname(path) == dirpath:
                children[os.path.basename(path)] = value
        return children

    def consume_directory(self, dirpath):
        """
        Return ``(progress, create_contents, compare_contents)`` for the
        direct children of ``dirpath``, each keyed by child name.  The
        recorded entries are left in place.
        """
        to_create = self._direct_children(self._create_contents, dirpath)
        to_compare = self._direct_children(self._compare_contents, dirpath)
        return self, to_create, to_compare

    def _remember(self, path, nodetype, cap, metadata):
        # Store one finished child so its parent directory can pick it up.
        self._create_contents[path] = (nodetype, cap, metadata)
        self._compare_contents[path] = cap

    def created_directory(self, path, dircap, metadata):
        """Record a newly created directory and make it the last dircap."""
        self._remember(path, "dirnode", dircap, metadata)
        self._directories_created += 1
        self.last_dircap = dircap
        return self

    def reused_directory(self, path, dircap, metadata):
        """Record a re-used directory and make it the last dircap."""
        self._remember(path, "dirnode", dircap, metadata)
        self._directories_reused += 1
        self.last_dircap = dircap
        return self

    def created_file(self, path, cap, metadata):
        """Record a newly uploaded file."""
        self._remember(path, "filenode", cap, metadata)
        self._files_created += 1
        return self

    def reused_file(self, path, cap, metadata):
        """Record a file whose earlier upload was re-used."""
        self._remember(path, "filenode", cap, metadata)
        self._files_reused += 1
        return self

    def permission_denied(self, isdir, quoted_path):
        """Warn about and count an unreadable file or directory."""
        template = "WARNING: permission denied on {kind} {path}"
        return self._skip(template, isdir, path=quoted_path)

    def decoding_failed(self, isdir, quoted_path):
        """Warn about and count a path hit by a filename encoding error."""
        template = (
            "WARNING: could not list {kind} {path} due to a filename encoding error"
        )
        return self._skip(template, isdir, path=quoted_path)

    def unsupported_filetype(self, isdir, quoted_path, filetype):
        """Warn about and count something of a type that cannot be backed up."""
        template = "WARNING: cannot backup {filetype} {path}"
        return self._skip(template, isdir, path=quoted_path, filetype=filetype)

    def _skip(self, message, isdir, **kw):
        # Bump the matching skip counter, then emit the formatted warning;
        # {kind} in the message becomes "directory" or "file".
        if isdir:
            kind = "directory"
            self._directories_skipped += 1
        else:
            kind = "file"
            self._files_skipped += 1
        self._warn(message.format(kind=kind, **kw))
        # Pretend we're a persistent data structure being transformed.
        return self

View File

@ -64,19 +64,28 @@ class Backup(GridTestMixin, CLITestMixin, StallMixin, unittest.TestCase):
d = self.do_cli("create-alias", "tahoe") d = self.do_cli("create-alias", "tahoe")
d.addCallback(lambda res: do_backup()) d.addCallback(lambda res: do_backup(True))
def _check0((rc, out, err)): def _check0((rc, out, err)):
print()
print(out)
self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(err, "")
self.failUnlessReallyEqual(rc, 0) self.failUnlessReallyEqual(rc, 0)
fu, fr, fs, dc, dr, ds = self.count_output(out) (
files_uploaded,
files_reused,
files_skipped,
directories_created,
directories_reused,
directories_skipped,
) = self.count_output(out)
# foo.txt, bar.txt, blah.txt # foo.txt, bar.txt, blah.txt
self.failUnlessReallyEqual(fu, 3) self.failUnlessReallyEqual(files_uploaded, 3)
self.failUnlessReallyEqual(fr, 0) self.failUnlessReallyEqual(files_reused, 0)
self.failUnlessReallyEqual(fs, 0) self.failUnlessReallyEqual(files_skipped, 0)
# empty, home, home/parent, home/parent/subdir # empty, home, home/parent, home/parent/subdir
self.failUnlessReallyEqual(dc, 4) self.failUnlessReallyEqual(directories_created, 4)
self.failUnlessReallyEqual(dr, 0) self.failUnlessReallyEqual(directories_reused, 0)
self.failUnlessReallyEqual(ds, 0) self.failUnlessReallyEqual(directories_skipped, 0)
d.addCallback(_check0) d.addCallback(_check0)
d.addCallback(lambda res: self.do_cli("ls", "--uri", "tahoe:backups")) d.addCallback(lambda res: self.do_cli("ls", "--uri", "tahoe:backups"))