From 16b950060f30fd1793608d2b14f2de235b0d8c83 Mon Sep 17 00:00:00 2001 From: Steve Slaven Date: Thu, 6 Aug 2009 22:39:44 -0700 Subject: Store a key in the physical file that pulls the data from a chunk to minimize 100% file duplicates to be nothing more than a key length diff --git a/FuseArchive/ChunkFile.py b/FuseArchive/ChunkFile.py index e713e10..7cc8e07 100644 --- a/FuseArchive/ChunkFile.py +++ b/FuseArchive/ChunkFile.py @@ -92,6 +92,7 @@ class ChunkFile(object): # fflush early if we're creating a new file since we reference this # attribute in the routine. At least it gets initialized I guess self.original_chunks = [] + self.original_key = None # TODO: Better flag handling here? if flags & os.O_RDONLY: @@ -131,9 +132,19 @@ class ChunkFile(object): try: magic = Serializer.loadfh( self.file ) logging.debug( "Got data: %s" % magic ) + + # This is just a key to a block to minimize complete + # duplicates + logging.debug( "Reading chunk to get actual file data" ) + self.original_key = magic + file_chunk = load_chunk( magic ) + magic = Serializer.loads( file_chunk ) + self.size = magic[ 'size' ] self.chunks = magic[ 'chunks' ] self.chunk_size = magic[ 'chunk_size' ] + logging.debug( "Loaded size: %d, chunk size: %d, chunks: %d" % + ( self.size, self.chunk_size, len( self.chunks ) ) ) except Exception, e: logging.critical( self.orig_path + ": " + str( e ) ) else: @@ -400,11 +411,24 @@ class ChunkFile(object): + "\nProbably a bug in write or ftruncate!" 
logging.debug( "Size calculated is: %d (0x%x)" % ( self.size, self.size ) ) - Serializer.dumpfh( self.file, { + key = save_chunk( Serializer.dumps( { 'size': self.size, 'chunks': self.chunks, 'chunk_size': self.chunk_size - } ) + } ) ) + + logging.debug( "Saved indirect file to key %s, saving key in main file" % key ) + + Serializer.dumpfh( self.file, key ) + + # Update file ref counts + if key != self.original_key: + logging.debug( "File key changed updating references" ) + if self.original_key != None: + unlock_chunk( self.original_key ) + + lock_chunk( key ) + self.original_key = key # Now update our chunk ref counts logging.debug( "Updating chunk references" ) diff --git a/FuseArchive/Serializer.py b/FuseArchive/Serializer.py index b23371c..621d90a 100644 --- a/FuseArchive/Serializer.py +++ b/FuseArchive/Serializer.py @@ -17,11 +17,15 @@ class Serializer: fh.seek( 0 ) f = gzip.GzipFile( None, "wb", gzip_compress_level, fh ) #f = fh - cPickle.dump( obj, f, -1 ) + f.write( Serializer.dumps( obj ) ) del f fh.flush() @staticmethod + def dumps( obj ): + return cPickle.dumps( obj, -1 ) + + @staticmethod def load( f ): if FuseArchive.magic_profiling: return { 'size': 0, 'chunks': 0, 'chunk_size': 0 } @@ -37,6 +41,9 @@ class Serializer: fh.seek( 0 ) f = gzip.GzipFile( None, "rb", gzip_compress_level, fh ) #f = fh - magic = cPickle.load( f ) + magic = Serializer.loads( f.read() ) return( magic ) + @staticmethod + def loads( str ): + return cPickle.loads( str ) -- cgit v0.10.2