import logging


# Handle efficient operations on a non-fixed length buffer like appending,
# replacing, reading chunks, etc
class ChunkBuffer(object):
    """Growable character buffer backed by a list of single characters.

    Appending to, or overwriting part of, a list avoids rebuilding a large
    immutable string on every write; the string form is only produced on
    demand by string().
    """

    def __init__(self, data=''):
        """Create a buffer holding the characters of *data* (default empty)."""
        # Pass the argument to logging instead of %-formatting it here, so a
        # multi-megabyte chunk is never rendered when DEBUG is disabled.
        logging.debug("Creating chunkbuffer: %s", data)
        self.chunk = list(data)

    def append(self, s):
        """Add the characters of *s* to the end of the buffer."""
        # extend() consumes any iterable; no intermediate list is needed.
        self.chunk.extend(s)

    def replace(self, s, start, end):
        """Overwrite the slice [start:end] of the buffer with *s*.

        Equivalent to the string splice
        ``chunk[:start] + s + chunk[end:]`` but done in place.  When
        *start* lies beyond the current end of the buffer, *s* is simply
        appended, matching what the string splice would produce.
        """
        self.chunk[start:end] = s

    def truncate(self, length):
        """Drop everything after the first *length* characters.

        Mirrors ``chunk = chunk[:length]``; a *length* at or beyond the
        current size leaves the buffer unchanged.
        """
        del self.chunk[length:]

    def length(self):
        """Return the number of characters currently held."""
        return len(self.chunk)

    def string(self):
        """Return the buffer contents joined into a single string."""
        # Lazy formatting again: the repr of a huge list is expensive.
        logging.debug("Stringifying: %s", self.chunk)
        return ''.join(self.chunk)
ChunkFile(object): self.modified = False # This is the current in-memory chunk and offset in to data[] - self.chunk_cache = {}; - self.chunk = '' + self.chunk_cache = {} + self.chunk = ChunkBuffer() self.chunk_index = -1 self.chunk_modified = False self.chunk_size = magic_blocksize @@ -181,17 +180,17 @@ class ChunkFile(object): key = self.chunks[ index ] if key: - if isinstance( key, str ): + if isinstance( key, ChunkBuffer ): logging.debug( "Found cached dirty page" ) self.chunk = key else: logging.debug( "Index: %s" % key ) - self.chunk = load_chunk( key ) + self.chunk = ChunkBuffer( load_chunk( key ) ) else: logging.debug( "No chunk at this index, loading nothing" ) - self.chunk = '' + self.chunk = ChunkBuffer() - logging.debug( "Loaded chunk of length: %d" % len( self.chunk ) ) + logging.debug( "Loaded chunk of length: %d" % self.chunk.length() ) self.chunk_index = index self.chunk_modified = False @@ -204,11 +203,12 @@ class ChunkFile(object): # Make sure we have room for this chunk size = len( self.chunks ) if self.chunk_index >= size: - self.chunks.extend( [ '' ] * ( self.chunk_index -size + 1 ) ) + self.chunks.extend( [ ChunkBuffer() ] * ( self.chunk_index -size + 1 ) ) # Increment dirty chunks if we had a key here already + logging.debug( "Chunk is: %s" % self.chunks[ self.chunk_index ] ); if isinstance( self.chunks[ self.chunk_index ], list ) or \ - len( self.chunks[ self.chunk_index ] ) == 0: + self.chunks[ self.chunk_index ].length() == 0: self.dirty_chunks += 1 logging.debug( "Dirty chunks is now: %d" % self.dirty_chunks ) logging.debug( "Dirty flush at: %d" % dirty_flush ) @@ -223,14 +223,34 @@ class ChunkFile(object): # This flushes any cached chunks def _flush_chunks(self): for index in range( len( self.chunks ) ): - if isinstance( self.chunks[ index ], str ): + if isinstance( self.chunks[ index ], ChunkBuffer ): logging.debug( "Flushing chunk at %d" % index ) - key = save_chunk( self.chunks[ index ] ) + key = save_chunk( self.chunks[ index 
].string() ) self.chunks[ index ] = key logging.debug( "Key was %s" % key ) self.dirty_chunks = 0 - self._update_chunk_references() + # If we had an old chunk here, free it + if len(self.original_chunks) >= index + 1: + oldkey = self.original_chunks[ index ] + if oldkey != key: + # Free this chunk + unlock_chunk( oldkey ) + # And keep this chunk + lock_chunk( key ) + # Else chunk didn't change, don't relock or anything + else: + # We did not have a chunk here so lock this chunk + lock_chunk( key ) + # And extend original chunks by 1 (we are walking + # sequentially so we don't need to worry about padding + # out intermediate chunks) + self.original_chunks.extend( [ ChunkBuffer() ] ) + + # And update the key in original chunks + self.original_chunks[ index ] = key + + #self._update_chunk_references() def read(self, length, offset): logging.debug( "Reading from %s offset: %d (0x%x) length: %d (0x%d)" % @@ -246,8 +266,8 @@ class ChunkFile(object): while data_read < length and not is_eof: logging.debug( "Pulling chunk data: %d" % index ) self._load_chunk( index ) - if len(self.chunk): - chunk_remaining = len(self.chunk) - rest + if self.chunk.length(): + chunk_remaining = self.chunk.length() - rest to_read = chunk_remaining data_left = length - data_read if data_left < chunk_remaining: @@ -259,7 +279,7 @@ class ChunkFile(object): logging.debug( "rest: %d" % rest ) logging.debug( "Copying %d bytes" % to_read ) - data += self.chunk[ rest:(rest+to_read) ] + data += self.chunk.string()[ rest:(rest+to_read) ] data_read += to_read index += 1 rest = 0 @@ -298,21 +318,20 @@ class ChunkFile(object): while this_index < index: self._load_chunk( this_index ) - fill_null = self.chunk_size - len(self.chunk) + fill_null = self.chunk_size - self.chunk.length() logging.debug( "Filling this chunk with null, bytes: %d" % fill_null ) - self.chunk += "\0" * fill_null - logging.debug( "Chunk is now: %d bytes" % len( self.chunk) ) + self.chunk.append( "\0" * fill_null ) + logging.debug( 
"Chunk is now: %d bytes" % self.chunk.length() ) self.chunk_modified = True self._save_chunk() this_index += 1 self._load_chunk( index ) - # Now check if this chunk needs to be extended - if len( self.chunk ) < rest: - fill_null = rest - len(self.chunk) + if self.chunk.length() < rest: + fill_null = rest - self.chunk.length() logging.debug( "Filling final chunk with null, bytes: %d" % fill_null ) - self.chunk += "\0" * fill_null + self.chunk.append( "\0" * fill_null ) self.chunk_modified = True self._save_chunk() @@ -344,13 +363,22 @@ class ChunkFile(object): logging.debug( "Pre-Buf: %s" % hexlify(buf) ) logging.debug( "Pre-Chunk: %s" % hexlify(self.chunk) ) - # Since python doesn't do in-place reassignment like you - # can with splice() we will reconstruct the data by joining - # stuff by offsets (first chars to skip, then our joining - # buf chunk, the everything that would have been after it) - self.chunk = self.chunk[ :rest ] + \ - buf[ buf_offset:(buf_offset+this_len) ] + \ - self.chunk[ (rest + this_len): ] + # Check if we are appending only, appends are much faster than + # splicing up string + if self.chunk.length() == rest and len( buf ) <= this_len: + logging.debug( "Doing quick append" ) + self.chunk.append( buf ) + else: + logging.debug( "SLOOOOW! Doing string splice" ) + # Since python doesn't do in-place reassignment like you + # can with splice() we will reconstruct the data by joining + # stuff by offsets (first chars to skip, then our joining + # buf chunk, the everything that would have been after it) + + # This sucks for moving around data, it is very slow! 
+ self.chunk.replace( buf[ buf_offset:(buf_offset+this_len) ], + rest, rest + this_len ) + if FuseArchive.deep_debug: logging.debug( "Post-Buf: %s" % hexlify(buf) ) @@ -368,7 +396,7 @@ class ChunkFile(object): if offset + len(buf) > self.size: self.size = offset + len(buf) - logging.debug( "This chunk size is now: %d" % len( self.chunk ) ) + logging.debug( "This chunk size is now: %d" % self.chunk.length() ) logging.debug( "File size is now: %d" % self.size ) logging.debug( "Num Chunks: %d" % len( self.chunks ) ) @@ -405,7 +433,7 @@ class ChunkFile(object): logging.debug( "We have %d chunks, calculating size" % numchunks ) self._load_chunk( numchunks - 1 ) self.size = ( numchunks - 1 ) * self.chunk_size + \ - len( self.chunk ) + self.chunk.length() else: logging.debug( "No chunks, setting size to zero" ) self.size = 0 @@ -546,7 +574,7 @@ class ChunkFile(object): if length == 0: logging.debug( "Creating 0 chunk file" ) self.chunks = [] - self.chunk = '' + self.chunk = ChunkBuffer() elif self.size <= length: logging.debug( "Need to pad out file, writing/seeking to %d" % length ) @@ -567,13 +595,13 @@ class ChunkFile(object): # last chunk if len( self.chunks ): self._load_chunk( len( self.chunks ) - 1 ) - logging.debug( "Loaded final chunk, len: %d" % len( self.chunk ) ) + logging.debug( "Loaded final chunk, len: %d" % self.chunk.length() ) # Now truncate this item if needed - if len( self.chunk ) > extra_bytes: + if self.chunk.length() > extra_bytes: logging.debug( "Truncating final chunk to %d" % extra_bytes ) - self.chunk = self.chunk[ :extra_bytes ] - logging.debug( "Chunk is now: %d bytes" % len( self.chunk ) ) + self.chunk.truncate( extra_bytes ) + logging.debug( "Chunk is now: %d bytes" % self.chunk.length() ) self.chunk_modified = True self.modified = True -- cgit v0.10.2