# Zip-backed chunk storage for fusearchive.py.
#
# Reconstructed from the "+" side of commit 54c2eb333a65 (Steve Slaven,
# Fri, 24 Jul 2009): "Testing out using zip files for chunk storage to
# eliminate some of the huge overhead of having thousands of tiny files and
# directories".
#
# The same commit renames the filesystem-backed save_chunk / load_chunk to
# _save_chunk_fs / _load_chunk_fs.  Their bodies are not part of this excerpt,
# and neither are dmsg(), debug_level and magic_profiling, which the code
# below expects to find at module level.

import os, sys, fcntl, fuse, sha, cPickle, gzip, errno, zipfile


def save_chunk( chunk ):
    """Store a data block and return the key that load_chunk() accepts.

    Thin dispatcher so the storage backend can be swapped in one place;
    it currently routes to the zip-backed implementation.
    """
    return _save_chunk_zip( chunk )


def load_chunk( key ):
    """Return the data block previously stored under key by save_chunk()."""
    return _load_chunk_zip( key )


def _chunk_zip_location( digest ):
    """Map a raw SHA digest to ( hexdigest, path of the zip that holds it ).

    Shared by the save and load paths so both always agree on the layout.
    """
    dmsg( 4, list( digest ) )

    # Hex-encode the digest one byte at a time; the member names inside the
    # zip are built from this string
    hexdigest = ''.join( [ "%02x" % ord( x ) for x in digest ] )

    # Bucket by the first 4 hex digits: at most 16**4 = 65536 zip files
    zipname = hexdigest[ 0:4 ] + ".zip"
    dmsg( 3, "Zip name: " + zipname )

    return ( hexdigest, "./storage/" + zipname )


def _save_chunk_zip( chunk ):
    """Write chunk into its bucket zip and return [ digest, slot ].

    Members are named "<hexdigest>_<slot>".  Slots are probed from 0 upwards:
    a member holding identical data is reused, a member holding different
    data (a digest collision) moves the probe to the next slot, and the first
    missing member is where the chunk gets written.

    Returns [ 0, 0 ] without touching storage when magic_profiling is set.

    Errors other than "member not found" (for example a corrupt archive) are
    propagated instead of being mistaken for a free slot.
    """
    if magic_profiling:
        return( [ 0, 0 ] )

    dmsg( 2, "Begin save_chunk, length: " + str( len( chunk ) ) )
    if debug_level > 4:
        dmsg( 5, "Chunk: " + str( chunk ) )

    # Save this hash string, similar to the backuppc algo
    digest = sha.new( chunk ).digest()
    ( hexdigest, zippath ) = _chunk_zip_location( digest )

    if not os.path.exists( zippath ):
        dmsg( 3, "Creating intial empty zip" )
        z = zipfile.ZipFile( zippath, 'w', zipfile.ZIP_DEFLATED, True )
        try:
            # Seed the archive with a dummy member: append mode throws an
            # exception if the file is not a zip yet, or maybe it's just
            # zero-length files
            z.writestr( 'junk', 'junk' )
        finally:
            z.close()

    z = zipfile.ZipFile( zippath, 'a', zipfile.ZIP_DEFLATED, True )
    try:
        # Find a chunk slot
        sub = 0
        while True:
            checkpath = hexdigest + "_" + str( sub )
            dmsg( 3, "Checking: " + checkpath )
            try:
                data = z.read( checkpath )
            except KeyError:
                # Only a missing member means the slot is free; testing the
                # length instead would re-append an empty chunk every time
                dmsg( 3, "No block here, creating new block" )
                z.writestr( checkpath, chunk )
                break

            if data == chunk:
                dmsg( 3, "Found existing block" )
                break

            dmsg( 3, "Block exists but is not the same" )
            sub += 1
    finally:
        # Always release the archive, even when reading or writing fails
        z.close()

    dmsg( 3, "Got chunk slot: " + str( sub ) )
    return( [ digest, sub ] )


def _load_chunk_zip( key ):
    """Return the chunk stored under key, a ( digest, slot ) pair.

    key is what _save_chunk_zip() returned.  Returns '' without touching
    storage when magic_profiling is set.  A zero-length member is a valid
    stored chunk and is returned as ''.

    Raises KeyError (from zipfile) when the archive has no member for this
    key; errors from opening the archive are propagated unchanged.
    """
    if magic_profiling:
        return ''

    ( thash, seq ) = key
    dmsg( 2, "Begin load_chunk" )

    ( hexdigest, zippath ) = _chunk_zip_location( thash )
    subpath = hexdigest + "_" + str( seq )

    z = zipfile.ZipFile( zippath, 'r', zipfile.ZIP_DEFLATED, True )
    try:
        dmsg( 3, "Chunk path: " + subpath )
        chunk = z.read( subpath )
    finally:
        # Close the archive on the failure path too
        z.close()

    dmsg( 3, "Exporting chunk" )
    if debug_level > 4:
        dmsg( 5, "Load-Chunk: " + str( chunk ) )

    return chunk