add sqlite-based backupdb, for #598 and others (including 'tahoe cp'). Not enabled yet.

This commit is contained in:
Brian Warner 2009-02-05 18:17:56 -07:00
parent 9ce226b4d8
commit 4be2e11d11
2 changed files with 391 additions and 0 deletions

View File

@ -0,0 +1,246 @@
# the backupdb is only available if sqlite3 is available. Python-2.5.x and
# beyond include sqlite3 in the standard library. For python-2.4, the
# "pysqlite2" package (which, despite the confusing name, uses sqlite3) must
# be installed. On debian, install python-pysqlite2
import os.path, sys, time, random, stat
# Durations expressed in seconds. MONTH is a fixed 30-day approximation and
# is what BackupDB_v1 uses for its filecheck thresholds.
DAY = 60 * 60 * 24
MONTH = DAY * 30
# SQL script that builds a brand-new version-1 backupdb. get_backupdb() hands
# it to executescript() when the database file does not exist yet, and then
# inserts the single row into 'version'.
#
#  version     - the schema version number
#  local_files - one row per absolute local path: the size/mtime/ctime that
#                were recorded for it, plus the fileid of its upload
#  caps        - hands out an auto-incrementing fileid for each unique filecap
#  last_upload - per-fileid timestamps of the last upload and the last check
SCHEMA_v1 = """
CREATE TABLE version
(
version INTEGER -- contains one row, set to 1
);
CREATE TABLE local_files
(
path VARCHAR(1024) PRIMARY KEY, -- index, this is os.path.abspath(fn)
size INTEGER, -- os.stat(fn)[stat.ST_SIZE]
mtime NUMBER, -- os.stat(fn)[stat.ST_MTIME]
ctime NUMBER, -- os.stat(fn)[stat.ST_CTIME]
fileid INTEGER
);
CREATE TABLE caps
(
fileid INTEGER PRIMARY KEY AUTOINCREMENT,
filecap VARCHAR(256) UNIQUE -- URI:CHK:...
);
CREATE TABLE last_upload
(
fileid INTEGER PRIMARY KEY,
last_uploaded TIMESTAMP,
last_checked TIMESTAMP
);
"""
def get_backupdb(dbfile, stderr=sys.stderr):
    """Open the backupdb stored in 'dbfile', creating it if it is missing.

    The parent directory of 'dbfile' must already exist.

    dbfile: path of the sqlite database file.
    stderr: file-like object that receives a one-line explanation whenever
            this function gives up.

    Returns a BackupDB_v1 instance when the database is usable and its
    schema version is 1. Returns None (after writing a message to 'stderr')
    when no sqlite binding can be imported, when the file cannot be opened,
    when it is not a usable database, or when its schema version is not one
    this code understands. On the last two paths the connection is closed
    before returning so the file is not left open.
    """
    try:
        import sqlite3
        sqlite = sqlite3 # pyflakes whines about 'import sqlite3 as sqlite' ..
    except ImportError:
        try:
            from pysqlite2 import dbapi2
            sqlite = dbapi2 # .. when this clause does it too
        except ImportError:
            stderr.write("sqlite unavailable, not using backupdb\n")
            return None

    # decide this before connect(), which may itself create the file
    must_create = not os.path.exists(dbfile)
    try:
        db = sqlite.connect(dbfile)
    except (EnvironmentError, sqlite.OperationalError):
        # sys.exc_info() instead of "except ..., e" / "except ... as e": it
        # is the one spelling accepted by both python-2.4 and python-3
        e = sys.exc_info()[1]
        stderr.write("Unable to create/open backupdb file %s: %s\n"
                     % (dbfile, e))
        return None

    c = db.cursor()
    if must_create:
        c.executescript(SCHEMA_v1)
        c.execute("INSERT INTO version (version) VALUES (1)")
        db.commit()

    try:
        c.execute("SELECT version FROM version")
        row = c.fetchone()
    except sqlite.DatabaseError:
        # this indicates that the file is not a compatible database format.
        # Perhaps it was created with an old version, or it might be junk.
        e = sys.exc_info()[1]
        stderr.write("backupdb file is unusable: %s\n" % e)
        db.close()
        return None

    if row is None:
        # the table exists but holds no row, so there is no version to read
        stderr.write("backupdb file is unusable: version table is empty\n")
        db.close()
        return None

    version = row[0]
    if version == 1:
        return BackupDB_v1(sqlite, db)

    stderr.write("Unable to handle backupdb version %s\n" % version)
    db.close()
    return None
# Two-value enumeration (0 and 1). Nothing in the visible part of this file
# refers to these names.
MUST_UPLOAD, ALREADY_UPLOADED = 0, 1
class Result:
    """The answer handed back for one local file.

    I remember the filecap found for the file (None when there is none),
    whether a filecheck is advised, and the path/mtime/ctime/size that were
    passed in, so they can be recorded if the caller reports an upload. I
    forward the caller's reports to the 'bdb' object that created me.
    """

    def __init__(self, bdb, filecap, should_check,
                 path, mtime, ctime, size):
        self.bdb, self.filecap = bdb, filecap
        self.should_check_p = should_check
        self.path, self.size = path, size
        self.mtime, self.ctime = mtime, ctime

    def was_uploaded(self):
        """Return the stored filecap, or False when I do not have one."""
        return self.filecap or False

    def did_upload(self, filecap):
        """Report that the file was uploaded and now lives at 'filecap'."""
        recorded_stat = (self.mtime, self.ctime, self.size)
        self.bdb.did_upload(filecap, self.path, *recorded_stat)

    def should_check(self):
        """Return the should_check flag I was created with."""
        return self.should_check_p

    def did_check_healthy(self, results):
        """Report that a filecheck of my filecap came back healthy."""
        self.bdb.did_check_healthy(self.filecap, results)
class BackupDB_v1:
    """Accessor for a version-1 backupdb.

    I remember which local files have been uploaded, the filecap each one
    was uploaded to, and when that filecap was last uploaded or checked.
    """

    VERSION = 1
    # A file checked more recently than NO_CHECK_BEFORE seconds ago is never
    # re-checked, one last checked more than ALWAYS_CHECK_AFTER seconds ago
    # always is; in between the probability of a check rises linearly.
    NO_CHECK_BEFORE = 1*MONTH
    ALWAYS_CHECK_AFTER = 2*MONTH

    def __init__(self, sqlite_module, connection):
        """Wrap an already-open connection.

        sqlite_module: the DB-API module the connection came from; its
                       IntegrityError is what my insert-or-update code
                       catches.
        connection: an open connection to a database holding the v1 schema.
        """
        self.sqlite_module = sqlite_module
        self.connection = connection
        self.cursor = connection.cursor()

    def check_file(self, path, use_timestamps=True):
        """I will tell you if a given local file needs to be uploaded or not,
        by looking in a database and seeing if I have a record of this file
        having been uploaded earlier.

        I return a Result object, synchronously. If r.was_uploaded() returns
        False, you should upload the file. When you are finished uploading
        it, call r.did_upload(filecap), so I can update my database.

        If was_uploaded() returns a filecap, you might be able to avoid an
        upload. Call r.should_check(), and if it says False, you can skip the
        upload and use the filecap returned by was_uploaded().

        If should_check() returns True, you should perform a filecheck on the
        filecap returned by was_uploaded(). If the check indicates the file
        is healthy, please call r.did_check_healthy(checker_results) so I can
        update the database. If the check indicates the file is not healthy,
        please upload the file and call r.did_upload(filecap) when you're
        done.

        If use_timestamps=True (the default), I will compare ctime and mtime
        of the local file against an entry in my database, and consider the
        file to be unchanged if ctime, mtime, and filesize are all the same
        as the earlier version. If use_timestamps=False, I will not trust the
        timestamps, so more files (perhaps all) will be marked as needing
        upload. A future version of this database may hash the file to make
        equality decisions, in which case use_timestamps=False will not
        always imply r.was_uploaded()==False.

        'path' points to a local file on disk, possibly relative to the
        current working directory. The database stores absolute pathnames.
        I call os.stat() on it, so whatever os.stat() raises for a missing
        or unreadable path propagates to the caller.

        When I decide the file must be uploaded again, I also delete (and
        commit the deletion of) the stale database row for 'path'.
        """
        path = os.path.abspath(path)
        s = os.stat(path)
        size = s[stat.ST_SIZE]
        ctime = s[stat.ST_CTIME]
        mtime = s[stat.ST_MTIME]

        now = time.time()
        c = self.cursor

        c.execute("SELECT size,mtime,ctime,fileid"
                  " FROM local_files"
                  " WHERE path=?",
                  (path,))
        row = c.fetchone()
        if not row:
            # never heard of this path
            return Result(self, None, False, path, mtime, ctime, size)
        (last_size,last_mtime,last_ctime,last_fileid) = row

        # look up the filecap and the time of its last check in one query
        c.execute("SELECT caps.filecap, last_upload.last_checked"
                  " FROM caps,last_upload"
                  " WHERE caps.fileid=? AND last_upload.fileid=?",
                  (last_fileid, last_fileid))
        row2 = c.fetchone()

        if ((last_size != size
             or not use_timestamps
             or last_mtime != mtime
             or last_ctime != ctime) # the file has been changed
            or (not row2) # we somehow forgot where we put the file last time
            ):
            c.execute("DELETE FROM local_files WHERE path=?", (path,))
            self.connection.commit()
            return Result(self, None, False, path, mtime, ctime, size)

        # at this point, we're allowed to assume the file hasn't been changed
        (filecap, last_checked) = row2
        age = now - last_checked

        probability = self._check_probability(age)
        should_check = bool(random.random() < probability)

        return Result(self, filecap, should_check, path, mtime, ctime, size)

    def _check_probability(self, age):
        # Map the number of seconds since the last check onto the
        # probability (0.0 .. 1.0) that a new filecheck should be done.
        window = self.ALWAYS_CHECK_AFTER - self.NO_CHECK_BEFORE
        if window <= 0:
            # The thresholds coincide (or are inverted), so there is no ramp
            # to interpolate over: dividing would raise ZeroDivisionError or
            # give a backwards answer. Use a hard cutoff instead.
            if age >= self.ALWAYS_CHECK_AFTER:
                return 1.0
            return 0.0
        # float() keeps this a true division even if every operand is an int
        probability = float(age - self.NO_CHECK_BEFORE) / window
        return min(max(probability, 0.0), 1.0)

    def get_or_allocate_fileid_for_cap(self, filecap):
        """Return the fileid for 'filecap', adding a caps row if needed.

        The caller is required to commit() afterwards.
        """
        # mysql has "INSERT ... ON DUPLICATE KEY UPDATE", but not sqlite
        # sqlite has "INSERT ON CONFLICT REPLACE", but not mysql
        # So we use INSERT, ignore any error, then a SELECT
        c = self.cursor
        try:
            c.execute("INSERT INTO caps (filecap) VALUES (?)", (filecap,))
        except self.sqlite_module.IntegrityError:
            # filecap is UNIQUE: the row is already there, which is fine
            pass
        c.execute("SELECT fileid FROM caps WHERE filecap=?", (filecap,))
        foundrow = c.fetchone()
        assert foundrow
        fileid = foundrow[0]
        return fileid

    def did_upload(self, filecap, path, mtime, ctime, size):
        """Record that 'path' was just uploaded to 'filecap'.

        Both last_uploaded and last_checked of the filecap are set to the
        current time, and the row for 'path' is created or overwritten with
        the given size/mtime/ctime. The change is committed before I return.
        """
        now = time.time()
        fileid = self.get_or_allocate_fileid_for_cap(filecap)
        # insert-or-update: fileid is the primary key of last_upload
        try:
            self.cursor.execute("INSERT INTO last_upload VALUES (?,?,?)",
                                (fileid, now, now))
        except self.sqlite_module.IntegrityError:
            self.cursor.execute("UPDATE last_upload"
                                " SET last_uploaded=?, last_checked=?"
                                " WHERE fileid=?",
                                (now, now, fileid))
        # insert-or-update: path is the primary key of local_files
        try:
            self.cursor.execute("INSERT INTO local_files VALUES (?,?,?,?,?)",
                                (path, size, mtime, ctime, fileid))
        except self.sqlite_module.IntegrityError:
            self.cursor.execute("UPDATE local_files"
                                " SET size=?, mtime=?, ctime=?, fileid=?"
                                " WHERE path=?",
                                (size, mtime, ctime, fileid, path))
        self.connection.commit()

    def did_check_healthy(self, filecap, results):
        """Record that a filecheck of 'filecap' just found it healthy.

        Sets last_checked of the filecap to the current time and commits.
        'results' is accepted but not used.
        """
        now = time.time()
        fileid = self.get_or_allocate_fileid_for_cap(filecap)
        self.cursor.execute("UPDATE last_upload"
                            " SET last_checked=?"
                            " WHERE fileid=?",
                            (now, fileid))
        self.connection.commit()

View File

@ -0,0 +1,145 @@
import os.path, time
from StringIO import StringIO
from twisted.trial import unittest
from allmydata.util import fileutil
from allmydata.scripts import backupdb
class BackupDB(unittest.TestCase):
    # Tests for allmydata.scripts.backupdb. Each test works in its own
    # directory underneath "backupdb/".

    def create_or_skip(self, dbfile):
        # Open/create the backupdb at 'dbfile'. Raises SkipTest when no
        # sqlite binding is installed; otherwise returns whatever
        # get_backupdb() returned, which may be None.
        stderr = StringIO()
        bdb = backupdb.get_backupdb(dbfile, stderr=stderr)
        if not bdb:
            if "sqlite unavailable" in stderr.getvalue():
                raise unittest.SkipTest("sqlite unavailable, skipping test")
        return bdb

    def test_basic(self):
        # a fresh database can be created and reports schema version 1
        self.basedir = basedir = os.path.join("backupdb", "create")
        fileutil.make_dirs(basedir)
        dbfile = os.path.join(basedir, "dbfile")
        bdb = self.create_or_skip(dbfile)
        self.failUnless(bdb)
        self.failUnlessEqual(bdb.VERSION, 1)

    def test_fail(self):
        # unusable database paths yield None plus a message on stderr
        self.basedir = basedir = os.path.join("backupdb", "fail")
        fileutil.make_dirs(basedir)

        # put a non-DB file in the way
        self.writeto("not-a-database", "I do not look like a sqlite database")
        stderr_f = StringIO()
        bdb = backupdb.get_backupdb(os.path.join(basedir, "not-a-database"),
                                    stderr_f)
        self.failUnlessEqual(bdb, None)
        stderr = stderr_f.getvalue()
        if "sqlite unavailable" in stderr:
            pass
        else:
            self.failUnless("backupdb file is unusable" in stderr)
            # older SQLite releases say "file is encrypted or is not a
            # database", newer ones say "file is not a database": only
            # insist on the part the two wordings share
            self.failUnless("is not a database" in stderr)

        # put a directory in the way, to exercise a different error path
        where = os.path.join(basedir, "roadblock-dir")
        fileutil.make_dirs(where)
        stderr_f = StringIO()
        bdb = backupdb.get_backupdb(where, stderr_f)
        self.failUnlessEqual(bdb, None)
        stderr = stderr_f.getvalue()
        if "sqlite unavailable" in stderr:
            pass
        else:
            self.failUnless(("Unable to create/open backupdb file %s" % where)
                            in stderr)
            self.failUnless("unable to open database file" in stderr)

    def writeto(self, filename, data):
        # Write 'data' to 'filename' (relative to self.basedir), creating
        # parent directories as needed. Returns the path that was written.
        fn = os.path.join(self.basedir, filename)
        parentdir = os.path.dirname(fn)
        fileutil.make_dirs(parentdir)
        f = open(fn, "w")
        try:
            f.write(data)
        finally:
            # close the file even if the write fails
            f.close()
        return fn

    def test_check(self):
        # walk check_file()/did_upload()/did_check_healthy() through the
        # life of two files
        self.basedir = basedir = os.path.join("backupdb", "check")
        fileutil.make_dirs(basedir)
        dbfile = os.path.join(basedir, "dbfile")
        bdb = self.create_or_skip(dbfile)
        self.failUnless(bdb)

        foo_fn = self.writeto("foo.txt", "foo.txt")
        blah_fn = self.writeto("bar/blah.txt", "blah.txt")

        r = bdb.check_file(foo_fn)
        self.failUnlessEqual(r.was_uploaded(), False)
        r.did_upload("foo-cap")

        r = bdb.check_file(blah_fn)
        self.failUnlessEqual(r.was_uploaded(), False)
        r.did_upload("blah-cap")

        r = bdb.check_file(foo_fn)
        self.failUnlessEqual(r.was_uploaded(), "foo-cap")
        self.failUnlessEqual(r.should_check(), False)

        time.sleep(1.0) # make sure the timestamp changes
        self.writeto("foo.txt", "NEW")

        r = bdb.check_file(foo_fn)
        self.failUnlessEqual(r.was_uploaded(), False)
        r.did_upload("new-cap")

        r = bdb.check_file(foo_fn)
        self.failUnlessEqual(r.was_uploaded(), "new-cap")
        self.failUnlessEqual(r.should_check(), False)
        # if we spontaneously decide to upload it anyways, nothing should
        # break
        r.did_upload("new-cap")

        r = bdb.check_file(foo_fn, use_timestamps=False)
        self.failUnlessEqual(r.was_uploaded(), False)
        r.did_upload("new-cap")

        r = bdb.check_file(foo_fn)
        self.failUnlessEqual(r.was_uploaded(), "new-cap")
        self.failUnlessEqual(r.should_check(), False)

        # blah.txt was uploaded more than a second ago (see the sleep
        # above), so with this tiny window a check is certain
        bdb.NO_CHECK_BEFORE = 0
        bdb.ALWAYS_CHECK_AFTER = 0.1

        r = bdb.check_file(blah_fn)
        self.failUnlessEqual(r.was_uploaded(), "blah-cap")
        self.failUnlessEqual(r.should_check(), True)
        r.did_check_healthy("results") # we know they're ignored for now

        # Widen the window again before asking whether a re-check is due.
        # With the 0.1s window still in place the answer would be random,
        # depending on how long the commit above happened to take.
        bdb.NO_CHECK_BEFORE = 200
        bdb.ALWAYS_CHECK_AFTER = 400

        r = bdb.check_file(blah_fn)
        self.failUnlessEqual(r.was_uploaded(), "blah-cap")
        self.failUnlessEqual(r.should_check(), False)

        os.unlink(os.path.join(basedir, "foo.txt"))
        fileutil.make_dirs(os.path.join(basedir, "foo.txt")) # file becomes dir
        r = bdb.check_file(foo_fn)
        self.failUnlessEqual(r.was_uploaded(), False)

    def test_wrong_version(self):
        # a database carrying an unknown schema version is rejected
        self.basedir = basedir = os.path.join("backupdb", "wrong_version")
        fileutil.make_dirs(basedir)

        where = os.path.join(basedir, "tooold.db")
        bdb = self.create_or_skip(where)
        # fail clearly here rather than with an AttributeError below
        self.failUnless(bdb)
        # reach into the DB and make it old
        bdb.cursor.execute("UPDATE version SET version=0")
        bdb.connection.commit()

        # now the next time we open the database, it should be an unusable
        # version
        stderr_f = StringIO()
        bdb = backupdb.get_backupdb(where, stderr_f)
        self.failUnlessEqual(bdb, None)
        stderr = stderr_f.getvalue()
        self.failUnlessEqual(stderr.strip(),
                             "Unable to handle backupdb version 0")