import os.path
import time
import urllib
import json
import datetime
from allmydata.scripts.common import get_alias, escape_path, DEFAULT_ALIAS, \
                                     UnknownAliasError
from allmydata.scripts.common_http import do_http, HTTPError, format_http_error
from allmydata.util import time_format
from allmydata.scripts import backupdb
from allmydata.util.encodingutil import listdir_unicode, quote_output, \
     quote_local_unicode_path, to_str, FilenameEncodingError, unicode_to_url
from allmydata.util.assertutil import precondition
from allmydata.util.fileutil import abspath_expanduser_unicode, precondition_abspath


def get_local_metadata(path):
    """
    Collect stat(2)-derived metadata for a local path.

    :param path: The local filesystem path to ``os.stat``.

    :return dict: ``"ctime"`` and ``"mtime"`` plus whichever of the
        ``st_*`` fields named below the platform's stat result provides.
    """
    metadata = {}

    # posix stat(2) metadata, depends on the platform
    os.stat_float_times(True)
    s = os.stat(path)
    metadata["ctime"] = s.st_ctime
    metadata["mtime"] = s.st_mtime

    misc_fields = ("st_mode", "st_ino", "st_dev", "st_uid", "st_gid")
    macos_misc_fields = ("st_rsize", "st_creator", "st_type")
    for field in misc_fields + macos_misc_fields:
        if hasattr(s, field):
            metadata[field] = getattr(s, field)

    # TODO: extended attributes, like on OS-X's HFS+
    return metadata


def mkdir(contents, options):
    """
    Create an immutable directory on the node and return its cap.

    :param dict contents: Maps each child name to a sequence whose first
        three items are the node type, the child's cap and its metadata.
    :param options: Mapping providing ``'node-url'``.

    :return: The response body, stripped and passed through ``to_str``.

    :raise HTTPError: If the response status is not 2xx.
    """
    kids = {
        childname: (
            entry[0],
            {
                "ro_uri": entry[1],
                "metadata": entry[2],
            },
        )
        for childname, entry in contents.items()
    }
    body = json.dumps(kids).encode("utf-8")
    url = options['node-url'] + "uri?t=mkdir-immutable"
    resp = do_http("POST", url, body)
    if resp.status < 200 or resp.status >= 300:
        raise HTTPError("Error during mkdir", resp)

    dircap = to_str(resp.read().strip())
    return dircap


def put_child(dirurl, childname, childcap):
    """
    Link ``childcap`` into the directory at ``dirurl`` under ``childname``.

    :param dirurl: URL of the parent directory; must end with ``"/"``.
    :param childname: Name of the new child entry.
    :param childcap: The cap sent as the body of the PUT request.

    :raise HTTPError: If the response status is neither 200 nor 201.
    """
    assert dirurl[-1] == "/"
    url = dirurl + urllib.quote(unicode_to_url(childname)) + "?t=uri"
    resp = do_http("PUT", url, childcap)
    if resp.status not in (200, 201):
        raise HTTPError("Error during put_child", resp)


class BackerUpper:
    """
    Drive one run of the backup command.

    :ivar int _files_checked: The number of files which the backup process has
        so-far inspected on the grid to determine if they need to be
        re-uploaded.

    :ivar int _directories_checked: The number of directories which the backup
        process has so-far inspected on the grid to determine if they need to
        be re-uploaded.
    """
    def __init__(self, options):
        self.options = options
        self._files_checked = 0
        self._directories_checked = 0

    def run(self):
        """
        Perform the backup described by ``self.options``.

        :return int: 0 on success, 1 if the backup db could not be loaded,
            the alias is unknown or the target directory could not be
            created, 2 if any file or directory was skipped.
        """
        options = self.options
        nodeurl = options['node-url']
        self.verbosity = 1
        if options['quiet']:
            self.verbosity = 0
        if options['verbose']:
            self.verbosity = 2
        stdout = options.stdout
        stderr = options.stderr

        start_timestamp = datetime.datetime.now()
        bdbfile = os.path.join(options["node-directory"],
                               "private", "backupdb.sqlite")
        bdbfile = abspath_expanduser_unicode(bdbfile)
        self.backupdb = backupdb.get_backupdb(bdbfile, stderr)
        if not self.backupdb:
            print >>stderr, "ERROR: Unable to load backup db."
            return 1

        try:
            rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
        except UnknownAliasError as e:
            e.display(stderr)
            return 1
        to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
        if path:
            to_url += escape_path(path)
        if not to_url.endswith("/"):
            to_url += "/"

        archives_url = to_url + "Archives/"

        # first step: make sure the target directory exists, as well as the
        # Archives/ subdirectory.
        resp = do_http("GET", archives_url + "?t=json")
        if resp.status == 404:
            resp = do_http("POST", archives_url + "?t=mkdir")
            if resp.status != 200:
                print >>stderr, format_http_error("Unable to create target directory", resp)
                return 1

        # second step: process the tree
        targets = list(collect_backup_targets(
            options.from_dir,
            listdir_unicode,
            self.options.filter_listdir,
        ))
        completed = run_backup(
            warn=self.warn,
            upload_file=self.upload,
            upload_directory=self.upload_directory,
            targets=targets,
            start_timestamp=start_timestamp,
            stdout=stdout,
        )
        new_backup_dircap = completed.dircap

        # third: attach the new backup to the list
        now = time_format.iso_utc(int(time.time()), sep="_") + "Z"

        put_child(archives_url, now, new_backup_dircap)
        put_child(to_url, "Latest", new_backup_dircap)
        print >>stdout, completed.report(
            self.verbosity,
            self._files_checked,
            self._directories_checked,
        )

        # The command exits with code 2 if files or directories were skipped
        if completed.any_skips():
            return 2

        # done!
        return 0

    def verboseprint(self, msg):
        """
        Print ``msg`` to the configured stdout when verbosity is 2 or more.
        """
        precondition(isinstance(msg, str), msg)
        if self.verbosity >= 2:
            print >>self.options.stdout, msg

    def warn(self, msg):
        """
        Print ``msg`` to the configured stderr.
        """
        precondition(isinstance(msg, str), msg)
        print >>self.options.stderr, msg

    def upload_directory(self, path, compare_contents, create_contents):
        """
        Create a directory on the grid, or re-use one recorded in the
        backupdb.

        :param path: Local path of the directory (used only for messages).
        :param compare_contents: Passed to ``check_backupdb_directory``.
        :param create_contents: Passed to ``mkdir`` if a new directory must
            be created.

        :return: ``(True, newdircap)`` if a directory was created, otherwise
            ``(False, dircap)`` with the cap recorded in the backupdb.
        """
        must_create, r = self.check_backupdb_directory(compare_contents)
        if must_create:
            self.verboseprint(" creating directory for %s" % quote_local_unicode_path(path))
            newdircap = mkdir(create_contents, self.options)
            assert isinstance(newdircap, str)
            if r:
                r.did_create(newdircap)
            return True, newdircap
        else:
            self.verboseprint(" re-using old directory for %s" % quote_local_unicode_path(path))
            return False, r.was_created()

    def check_backupdb_file(self, childpath):
        """
        Decide whether the file at ``childpath`` must be uploaded.

        :return: ``(must_upload, results)`` where ``results`` is the object
            returned by the backupdb's ``check_file`` (or ``None`` when
            there is no backupdb).
        """
        if not self.backupdb:
            return True, None
        use_timestamps = not self.options["ignore-timestamps"]
        r = self.backupdb.check_file(childpath, use_timestamps)

        if not r.was_uploaded():
            return True, r

        if not r.should_check():
            # the file was uploaded or checked recently, so we can just use
            # it
            return False, r

        # we must check the file before using the results
        filecap = r.was_uploaded()
        self.verboseprint("checking %s" % quote_output(filecap))
        nodeurl = self.options['node-url']
        checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(filecap)
        self._files_checked += 1
        resp = do_http("POST", checkurl)
        if resp.status != 200:
            # can't check, so we must assume it's bad
            return True, r

        cr = json.loads(resp.read())
        healthy = cr["results"]["healthy"]
        if not healthy:
            # must upload
            return True, r
        # file is healthy, no need to upload
        r.did_check_healthy(cr)
        return False, r

    def check_backupdb_directory(self, compare_contents):
        """
        Decide whether a directory with ``compare_contents`` must be created.

        :return: ``(must_create, results)`` where ``results`` is the object
            returned by the backupdb's ``check_directory`` (or ``None`` when
            there is no backupdb).
        """
        if not self.backupdb:
            return True, None
        r = self.backupdb.check_directory(compare_contents)

        if not r.was_created():
            return True, r

        if not r.should_check():
            # the directory was created or checked recently, so we can just
            # use it
            return False, r

        # we must check the directory before re-using it
        dircap = r.was_created()
        self.verboseprint("checking %s" % quote_output(dircap))
        nodeurl = self.options['node-url']
        checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(dircap)
        self._directories_checked += 1
        resp = do_http("POST", checkurl)
        if resp.status != 200:
            # can't check, so we must assume it's bad
            return True, r

        cr = json.loads(resp.read())
        healthy = cr["results"]["healthy"]
        if not healthy:
            # must create
            return True, r
        # directory is healthy, no need to upload
        r.did_check_healthy(cr)
        return False, r

    # This function will raise an IOError exception when called on an unreadable file
    def upload(self, childpath):
        """
        Upload the file at ``childpath`` unless the backupdb says it can be
        re-used.

        :param childpath: Absolute local path of the file.

        :return: ``(True, filecap, metadata)`` if the file was uploaded,
            otherwise ``(False, filecap, metadata)`` with the previously
            recorded cap.

        :raise HTTPError: If the PUT response status is neither 200 nor 201.
        """
        precondition_abspath(childpath)

        metadata = get_local_metadata(childpath)

        # we can use the backupdb here
        must_upload, bdb_results = self.check_backupdb_file(childpath)

        if must_upload:
            self.verboseprint("uploading %s.." % quote_local_unicode_path(childpath))
            url = self.options['node-url'] + "uri"
            # Close the file as soon as the request is done instead of
            # leaking one handle per uploaded file.
            # NOTE(review): assumes do_http has finished reading the body
            # by the time it returns the response -- confirm.
            with open(childpath, "rb") as infileobj:
                resp = do_http("PUT", url, infileobj)
            if resp.status not in (200, 201):
                raise HTTPError("Error during file PUT", resp)

            filecap = resp.read().strip()
            self.verboseprint(" %s -> %s" % (quote_local_unicode_path(childpath, quotemarks=False),
                                             quote_output(filecap, quotemarks=False)))

            if bdb_results:
                bdb_results.did_upload(filecap)

            return True, filecap, metadata

        else:
            self.verboseprint("skipping %s.." % quote_local_unicode_path(childpath))
            return False, bdb_results.was_uploaded(), metadata


def backup(options):
    """
    Run a backup with ``options`` and return the exit code of
    ``BackerUpper.run``.
    """
    bu = BackerUpper(options)
    return bu.run()


def collect_backup_targets(root, listdir, filter_children):
    """
    Yield BackupTargets in a suitable order for processing (deepest targets
    before their parents).

    :param root: The local directory to walk.
    :param listdir: Callable returning the children names of a directory.
    :param filter_children: Callable applied to the listed children; only
        the names it returns are visited.
    """
    try:
        children = listdir(root)
    except EnvironmentError:
        yield PermissionDeniedTarget(root, isdir=True)
    except FilenameEncodingError:
        yield FilenameUndecodableTarget(root, isdir=True)
    else:
        for child in filter_children(children):
            assert isinstance(child, unicode), child
            childpath = os.path.join(root, child)
            if os.path.islink(childpath):
                yield LinkTarget(childpath, isdir=False)
            elif os.path.isdir(childpath):
                child_targets = collect_backup_targets(
                    childpath,
                    listdir,
                    filter_children,
                )
                for child_target in child_targets:
                    yield child_target
            elif os.path.isfile(childpath):
                yield FileTarget(childpath)
            else:
                # _ErrorTarget.__init__ requires isdir; omitting it raised
                # TypeError for every special file encountered.
                yield SpecialTarget(childpath, isdir=False)

        yield DirectoryTarget(root)


def run_backup(
        warn,
        upload_file,
        upload_directory,
        targets,
        start_timestamp,
        stdout,
):
    """
    Back up every target in order, printing a progress line after each.

    :param warn: Callable taking a warning message.
    :param upload_file: Callable passed through to each target's ``backup``.
    :param upload_directory: Callable passed through to each target's
        ``backup``.
    :param targets: Sized iterable of target objects.
    :param start_timestamp: ``datetime`` at which the backup started.
    :param stdout: File-like object receiving the progress lines.

    :return BackupComplete: The result of ``progress.backup_finished()``.
    """
    progress = BackupProgress(warn, start_timestamp, len(targets))
    for target in targets:
        # Pass in the progress and get back a progress.  It would be great if
        # progress objects were immutable.  Then the target's backup would
        # make a new progress with the desired changes and return it to us.
        # Currently, BackupProgress is mutable, though, and everything just
        # mutates it.
        progress = target.backup(progress, upload_file, upload_directory)
        print >>stdout, progress.report(datetime.datetime.now())
    return progress.backup_finished()


class FileTarget(object):
    """
    A regular file to be uploaded.
    """
    def __init__(self, path):
        self._path = path

    def __repr__(self):
        return "<File {}>".format(self._path)

    def backup(self, progress, upload_file, upload_directory):
        """
        Upload the file via ``upload_file`` and record the outcome on
        ``progress``; an ``EnvironmentError`` is recorded as a
        permission-denied skip instead.
        """
        try:
            created, childcap, metadata = upload_file(self._path)
        except EnvironmentError:
            target = PermissionDeniedTarget(self._path, isdir=False)
            return target.backup(progress, upload_file, upload_directory)
        else:
            assert isinstance(childcap, str)
            if created:
                return progress.created_file(self._path, childcap, metadata)
            return progress.reused_file(self._path, childcap, metadata)


class DirectoryTarget(object):
    """
    A directory whose children have already been processed.
    """
    def __init__(self, path):
        self._path = path

    def __repr__(self):
        return "<Directory {}>".format(self._path)

    def backup(self, progress, upload_file, upload_directory):
        """
        Gather this directory's children from ``progress``, create or re-use
        the directory via ``upload_directory`` and record the outcome.
        """
        metadata = get_local_metadata(self._path)
        progress, create, compare = progress.consume_directory(self._path)
        did_create, dircap = upload_directory(self._path, compare, create)
        if did_create:
            return progress.created_directory(self._path, dircap, metadata)
        return progress.reused_directory(self._path, dircap, metadata)


class _ErrorTarget(object):
    """
    Base for targets which cannot be backed up and are only reported.
    """
    def __init__(self, path, isdir):
        self._path = path
        self._quoted_path = quote_local_unicode_path(path)
        self._isdir = isdir


class PermissionDeniedTarget(_ErrorTarget):
    def backup(self, progress, upload_file, upload_directory):
        return progress.permission_denied(self._isdir, self._quoted_path)


class FilenameUndecodableTarget(_ErrorTarget):
    def backup(self, progress, upload_file, upload_directory):
        return progress.decoding_failed(self._isdir, self._quoted_path)


class LinkTarget(_ErrorTarget):
    def backup(self, progress, upload_file, upload_directory):
        return progress.unsupported_filetype(
            self._isdir,
            self._quoted_path,
            "symlink",
        )


class SpecialTarget(_ErrorTarget):
    def backup(self, progress, upload_file, upload_directory):
        return progress.unsupported_filetype(
            self._isdir,
            self._quoted_path,
            "special",
        )


class BackupComplete(object):
    """
    The final counters and root dircap of a finished backup.
    """
    def __init__(self,
                 start_timestamp,
                 end_timestamp,
                 files_created,
                 files_reused,
                 files_skipped,
                 directories_created,
                 directories_reused,
                 directories_skipped,
                 dircap,
    ):
        self._start_timestamp = start_timestamp
        self._end_timestamp = end_timestamp
        self._files_created = files_created
        self._files_reused = files_reused
        self._files_skipped = files_skipped
        self._directories_created = directories_created
        self._directories_reused = directories_reused
        self._directories_skipped = directories_skipped
        self.dircap = dircap

    def any_skips(self):
        """
        Return a true value if any file or directory was skipped.
        """
        return self._files_skipped or self._directories_skipped

    def report(self, verbosity, files_checked, directories_checked):
        """
        Build the multi-line summary text for the finished backup.

        :param int verbosity: Counters are included at 1 or more, the
            checked counts at 2 or more.
        :param int files_checked: Number of files checked on the grid.
        :param int directories_checked: Number of directories checked on the
            grid.

        :return str: The report lines joined with newlines.
        """
        result = []

        if verbosity >= 1:
            result.append(
                " %d files uploaded (%d reused),"
                " %d files skipped,"
                " %d directories created (%d reused),"
                " %d directories skipped" % (
                    self._files_created,
                    self._files_reused,
                    self._files_skipped,
                    self._directories_created,
                    self._directories_reused,
                    self._directories_skipped,
                ),
            )

        if verbosity >= 2:
            result.append(
                " %d files checked, %d directories checked" % (
                    files_checked,
                    directories_checked,
                ),
            )
        # calc elapsed time, omitting microseconds
        elapsed_time = str(
            self._end_timestamp - self._start_timestamp
        ).split('.')[0]
        result.append(" backup done, elapsed time: %s" % (elapsed_time,))

        return "\n".join(result)


class BackupProgress(object):
    """
    Mutable running state of a backup: counters plus the caps of the
    children seen so far.
    """
    # Would be nice if this data structure were immutable and its methods were
    # transformations that created a new slightly different object.  Not there
    # yet, though.
    def __init__(self, warn, start_timestamp, target_count):
        self._warn = warn
        self._start_timestamp = start_timestamp
        self._target_count = target_count
        self._files_created = 0
        self._files_reused = 0
        self._files_skipped = 0
        self._directories_created = 0
        self._directories_reused = 0
        self._directories_skipped = 0
        self.last_dircap = None
        self._create_contents = {}
        self._compare_contents = {}

    def report(self, now):
        """
        Return a one-line progress message as of the ``datetime`` ``now``.
        """
        report_format = (
            "Backing up {target_progress}/{target_total}... {elapsed} elapsed..."
        )
        return report_format.format(
            target_progress=(
                self._files_created
                + self._files_reused
                + self._files_skipped
                + self._directories_created
                + self._directories_reused
                + self._directories_skipped
            ),
            target_total=self._target_count,
            elapsed=self._format_elapsed(now - self._start_timestamp),
        )

    def _format_elapsed(self, elapsed):
        """
        Format a ``timedelta`` as ``"<h>h <m>m <s>s"``.
        """
        seconds = elapsed.total_seconds()
        hours = int(seconds / 3600)
        minutes = int(seconds / 60 % 60)
        seconds = int(seconds % 60)
        return "{}h {}m {}s".format(
            hours,
            minutes,
            seconds,
        )

    def backup_finished(self):
        """
        Snapshot the counters and last dircap into a ``BackupComplete``.
        """
        end_timestamp = datetime.datetime.now()
        return BackupComplete(
            self._start_timestamp,
            end_timestamp,
            self._files_created,
            self._files_reused,
            self._files_skipped,
            self._directories_created,
            self._directories_reused,
            self._directories_skipped,
            self.last_dircap,
        )

    def consume_directory(self, dirpath):
        """
        Select the recorded children whose parent directory is ``dirpath``.

        :return: ``(self, create_contents, compare_contents)`` where both
            dicts are keyed by child basename.
        """
        return self, {
            os.path.basename(create_path): create_value
            for (create_path, create_value)
            in self._create_contents.iteritems()
            if os.path.dirname(create_path) == dirpath
        }, {
            os.path.basename(compare_path): compare_value
            for (compare_path, compare_value)
            in self._compare_contents.iteritems()
            if os.path.dirname(compare_path) == dirpath
        }

    def created_directory(self, path, dircap, metadata):
        self._create_contents[path] = ("dirnode", dircap, metadata)
        self._compare_contents[path] = dircap
        self._directories_created += 1
        self.last_dircap = dircap
        return self

    def reused_directory(self, path, dircap, metadata):
        self._create_contents[path] = ("dirnode", dircap, metadata)
        self._compare_contents[path] = dircap
        self._directories_reused += 1
        self.last_dircap = dircap
        return self

    def created_file(self, path, cap, metadata):
        self._create_contents[path] = ("filenode", cap, metadata)
        self._compare_contents[path] = cap
        self._files_created += 1
        return self

    def reused_file(self, path, cap, metadata):
        self._create_contents[path] = ("filenode", cap, metadata)
        self._compare_contents[path] = cap
        self._files_reused += 1
        return self

    def permission_denied(self, isdir, quoted_path):
        return self._skip(
            "WARNING: permission denied on {kind} {path}",
            isdir,
            path=quoted_path,
        )

    def decoding_failed(self, isdir, quoted_path):
        return self._skip(
            "WARNING: could not list {kind} {path} due to a filename encoding error",
            isdir,
            path=quoted_path,
        )

    def unsupported_filetype(self, isdir, quoted_path, filetype):
        return self._skip(
            "WARNING: cannot backup {filetype} {path}",
            isdir,
            path=quoted_path,
            filetype=filetype,
        )

    def _skip(self, message, isdir, **kw):
        """
        Count one skipped file or directory and emit the formatted warning.
        """
        if isdir:
            self._directories_skipped += 1
            kind = "directory"
        else:
            self._files_skipped += 1
            kind = "file"
        self._warn(message.format(kind=kind, **kw))
        # Pretend we're a persistent data structure being transformed.
        return self