Use a string-buffer list instead of appending strings; apparently Python

string handling performs very badly when appending to or using large strings
author: Steve Slaven <bpk@hoopajoo.net> 2009-11-02 23:36:12 (GMT)
committer: Steve Slaven <bpk@hoopajoo.net> 2009-11-02 23:36:12 (GMT)
commit: b4f754a596f8d262d0f3089c37bf47c73d8ccfc1 (patch)
tree: 11d2287b9bb38961b216c4b03de9c722eaee0a6f
parent: c44db1796c8389d89acd4122da6ffdd72998d6a0 (diff)
download: fusearchive-b4f754a596f8d262d0f3089c37bf47c73d8ccfc1.zip
fusearchive-b4f754a596f8d262d0f3089c37bf47c73d8ccfc1.tar.gz
fusearchive-b4f754a596f8d262d0f3089c37bf47c73d8ccfc1.tar.bz2
2 files changed, 88 insertions, 39 deletions
diff --git a/FuseArchive/ChunkBuffer.py b/FuseArchive/ChunkBuffer.py
new file mode 100644
index 0000000..4ef6370
--- /dev/null
+++ b/FuseArchive/ChunkBuffer.py
@@ -0,0 +1,21 @@
+import logging
+
+# Handle efficient operations on a non-fixed length buffer like appending,
+# replacing, reading chunks, etc
+class ChunkBuffer:
+    def __init__( self, data = '' ):
+        logging.debug( "Creating chunkbuffer: %s" % data )
+        self.chunk = list( data )
+
+    def append( self, s ):
+        self.chunk.extend( list( s ) )
+
+    def replace( self, s, start, end ):
+        self.chunk
+
+    def length( self ):
+        return len( self.chunk )
+
+    def string(self):
+        logging.debug( "Stringifying: %s" % self.chunk )
+        return ''.join( self.chunk )
diff --git a/FuseArchive/ChunkFile.py b/FuseArchive/ChunkFile.py
index 6a0ea34..ca7ee2b 100644
--- a/FuseArchive/ChunkFile.py
+++ b/FuseArchive/ChunkFile.py
@@ -2,17 +2,16 @@ import logging, os, errno, fcntl, fuse, FuseArchive, copy
 import FuseArchive.Storage.ZipFile, FuseArchive.Storage.FileSystem
 from binascii import hexlify
 from FuseArchive.Serializer import Serializer
+from ChunkBuffer import ChunkBuffer
 
 # These control some of the file output
-magic_blocksize = 1024 * 128
+magic_blocksize = 1024 * 1024 * 5
 # Use a tiny block size to debug writes, so you can use a smaller test file
 #magic_blocksize = 1024
 chunkstyle = 'fs'
 
-# Memory for dirty blocks, per file (1M)
-dirty_size = 1024 * 1024 * 1;
 # This is the number of actualy blocks in that size
-dirty_flush = int( dirty_size / magic_blocksize )
+dirty_flush = 5 * magic_blocksize
 
 # This is a cache of open files by inode, to fix the lseek == size problem
 # this causes a failure in fsx-linux becuase to to lseek(fd,0,seek_end) it
@@ -78,8 +77,8 @@ class ChunkFile(object):
         self.modified = False
 
         # This is the current in-memory chunk and offset in to data[]
-        self.chunk_cache = {};
-        self.chunk = ''
+        self.chunk_cache = {}
+        self.chunk = ChunkBuffer()
         self.chunk_index = -1
         self.chunk_modified = False
         self.chunk_size = magic_blocksize
@@ -181,17 +180,17 @@ class ChunkFile(object):
             key = self.chunks[ index ]
 
         if key:
-            if isinstance( key, str ):
+            if isinstance( key, ChunkBuffer ):
                 logging.debug( "Found cached dirty page" )
                 self.chunk = key
             else:
                 logging.debug( "Index: %s" % key )
-                self.chunk = load_chunk( key )
+                self.chunk = ChunkBuffer( load_chunk( key ) )
         else:
             logging.debug( "No chunk at this index, loading nothing" )
-            self.chunk = ''
+            self.chunk = ChunkBuffer()
 
-        logging.debug( "Loaded chunk of length: %d" % len( self.chunk ) )
+        logging.debug( "Loaded chunk of length: %d" % self.chunk.length() )
 
         self.chunk_index = index
         self.chunk_modified = False
@@ -204,11 +203,12 @@ class ChunkFile(object):
             # Make sure we have room for this chunk
             size = len( self.chunks )
             if self.chunk_index >= size:
-                self.chunks.extend( [ '' ] * ( self.chunk_index  -size + 1 ) )
+                self.chunks.extend( [ ChunkBuffer() ] * ( self.chunk_index  -size + 1 ) )
 
             # Increment dirty chunks if we had a key here already
+            logging.debug( "Chunk is: %s" % self.chunks[ self.chunk_index ] );
             if isinstance( self.chunks[ self.chunk_index ], list ) or \
-                    len( self.chunks[ self.chunk_index ] ) == 0:
+                    self.chunks[ self.chunk_index ].length() == 0:
                 self.dirty_chunks += 1
                 logging.debug( "Dirty chunks is now: %d" % self.dirty_chunks )
                 logging.debug( "Dirty flush at: %d" % dirty_flush )
@@ -223,14 +223,34 @@ class ChunkFile(object):
     # This flushes any cached chunks
     def _flush_chunks(self):
         for index in range( len( self.chunks ) ):
-            if isinstance( self.chunks[ index ], str ):
+            if isinstance( self.chunks[ index ], ChunkBuffer ):
                 logging.debug( "Flushing chunk at %d" % index )
-                key = save_chunk( self.chunks[ index ] )
+                key = save_chunk( self.chunks[ index ].string() )
                 self.chunks[ index ] = key
                 logging.debug( "Key was %s" % key )
                 self.dirty_chunks = 0
 
-        self._update_chunk_references()
+                # If we had an old chunk here, free it
+                if len(self.original_chunks) >= index + 1:
+                    oldkey = self.original_chunks[ index ]
+                    if oldkey != key:
+                        # Free this chunk
+                        unlock_chunk( oldkey )
+                        # And keep this chunk
+                        lock_chunk( key )
+                    # Else chunk didn't change, don't relock or anything
+                else:
+                    # We did not have a chunk here so lock this chunk
+                    lock_chunk( key )
+                    # And extend original chunks by 1 (we are walking
+                    # sequentially so we don't need to worry about padding
+                    # out intermediate chunks)
+                    self.original_chunks.extend( [ ChunkBuffer() ] )
+
+                # And update the key in original chunks
+                self.original_chunks[ index ] = key
+
+        #self._update_chunk_references()
 
     def read(self, length, offset):
         logging.debug( "Reading from %s offset: %d (0x%x) length: %d (0x%d)" %
@@ -246,8 +266,8 @@ class ChunkFile(object):
         while data_read < length and not is_eof:
             logging.debug( "Pulling chunk data: %d" % index )
             self._load_chunk( index )
-            if len(self.chunk):
-                chunk_remaining = len(self.chunk) - rest
+            if self.chunk.length():
+                chunk_remaining = self.chunk.length() - rest
                 to_read = chunk_remaining
                 data_left = length - data_read
                 if data_left < chunk_remaining:
@@ -259,7 +279,7 @@ class ChunkFile(object):
                 logging.debug( "rest: %d" % rest )
                 logging.debug( "Copying %d bytes" % to_read )
 
-                data += self.chunk[ rest:(rest+to_read) ]
+                data += self.chunk.string()[ rest:(rest+to_read) ]
                 data_read += to_read
                 index += 1
                 rest = 0
@@ -298,21 +318,20 @@ class ChunkFile(object):
 
             while this_index < index:
                 self._load_chunk( this_index )
-                fill_null = self.chunk_size - len(self.chunk)
+                fill_null = self.chunk_size - self.chunk.length()
                 logging.debug( "Filling this chunk with null, bytes: %d" % fill_null )
-                self.chunk += "\0" * fill_null
-                logging.debug( "Chunk is now: %d bytes" % len( self.chunk) )
+                self.chunk.append( "\0" * fill_null )
+                logging.debug( "Chunk is now: %d bytes" % self.chunk.length() )
                 self.chunk_modified = True
                 self._save_chunk()
                 this_index += 1
 
         self._load_chunk( index )
 
-        # Now check if this chunk needs to be extended
-        if len( self.chunk ) < rest:
-            fill_null = rest - len(self.chunk)
+        if self.chunk.length() < rest:
+            fill_null = rest - self.chunk.length()
             logging.debug( "Filling final chunk with null, bytes: %d" % fill_null )
-            self.chunk += "\0" * fill_null
+            self.chunk.append( "\0" * fill_null )
             self.chunk_modified = True
             self._save_chunk()
 
@@ -344,13 +363,22 @@ class ChunkFile(object):
                 logging.debug( "Pre-Buf: %s" % hexlify(buf) )
                 logging.debug( "Pre-Chunk: %s" % hexlify(self.chunk) )
 
-            # Since python doesn't do in-place reassignment like you
-            # can with splice() we will reconstruct the data by joining
-            # stuff by offsets (first chars to skip, then our joining
-            # buf chunk, the everything that would have been after it)
-            self.chunk = self.chunk[ :rest ] + \
-                buf[ buf_offset:(buf_offset+this_len) ] + \
-                self.chunk[ (rest + this_len): ]
+            # Check if we are appending only, appends are much faster than
+            # splicing up string
+            if self.chunk.length() == rest and len( buf ) <= this_len:
+                logging.debug( "Doing quick append" )
+                self.chunk.append( buf )
+            else:
+                logging.debug( "SLOOOOW!  Doing string splice" )
+                # Since python doesn't do in-place reassignment like you
+                # can with splice() we will reconstruct the data by joining
+                # stuff by offsets (first chars to skip, then our joining
+                # buf chunk, the everything that would have been after it)
+
+                # This sucks for moving around data, it is very slow!
+                self.chunk.replace( buf[ buf_offset:(buf_offset+this_len) ],
+                    rest, rest + this_len )
+
 
             if FuseArchive.deep_debug:
                 logging.debug( "Post-Buf: %s" % hexlify(buf) )
@@ -368,7 +396,7 @@ class ChunkFile(object):
         if offset + len(buf) > self.size:
             self.size = offset + len(buf)
 
-        logging.debug( "This chunk size is now: %d" % len( self.chunk ) )
+        logging.debug( "This chunk size is now: %d" % self.chunk.length() )
         logging.debug( "File size is now: %d" % self.size )
         logging.debug( "Num Chunks: %d" % len( self.chunks ) )
 
@@ -405,7 +433,7 @@ class ChunkFile(object):
                 logging.debug( "We have %d chunks, calculating size" % numchunks )
                 self._load_chunk( numchunks - 1 )
                 self.size = ( numchunks - 1 ) * self.chunk_size + \
-                    len( self.chunk )
+                    self.chunk.length()
             else:
                 logging.debug( "No chunks, setting size to zero" )
                 self.size = 0
@@ -546,7 +574,7 @@ class ChunkFile(object):
         if length == 0:
             logging.debug( "Creating 0 chunk file" )
             self.chunks = []
-            self.chunk = ''
+            self.chunk = ChunkBuffer()
         elif self.size <= length:
             logging.debug( "Need to pad out file, writing/seeking to %d" % length )
 
@@ -567,13 +595,13 @@ class ChunkFile(object):
             # last chunk
             if len( self.chunks ):
                 self._load_chunk( len( self.chunks ) - 1 )
-                logging.debug( "Loaded final chunk, len: %d" % len( self.chunk ) )
+                logging.debug( "Loaded final chunk, len: %d" % self.chunk.length() )
 
             # Now truncate this item if needed
-            if len( self.chunk ) > extra_bytes:
+            if self.chunk.length() > extra_bytes:
                 logging.debug( "Truncating final chunk to %d" % extra_bytes )
-                self.chunk = self.chunk[ :extra_bytes ]
-                logging.debug( "Chunk is now: %d bytes" % len( self.chunk ) )
+                self.chunk.truncate( extra_bytes )
+                logging.debug( "Chunk is now: %d bytes" % self.chunk.length() )
 
         self.chunk_modified = True
         self.modified = True
author	Steve Slaven <bpk@hoopajoo.net>	2009-11-02 23:36:12 (GMT)
committer	Steve Slaven <bpk@hoopajoo.net>	2009-11-02 23:36:12 (GMT)
commit	b4f754a596f8d262d0f3089c37bf47c73d8ccfc1 (patch)
tree	11d2287b9bb38961b216c4b03de9c722eaee0a6f
parent	c44db1796c8389d89acd4122da6ffdd72998d6a0 (diff)
download	fusearchive-b4f754a596f8d262d0f3089c37bf47c73d8ccfc1.zip fusearchive-b4f754a596f8d262d0f3089c37bf47c73d8ccfc1.tar.gz fusearchive-b4f754a596f8d262d0f3089c37bf47c73d8ccfc1.tar.bz2