From 54c2eb333a65d8b16f7edca0b7d0909f76088829 Mon Sep 17 00:00:00 2001
From: Steve Slaven <bpk@hoopajoo.net>
Date: Fri, 24 Jul 2009 14:15:23 -0700
Subject: Testing out using zip files for chunk storage to eliminate some of
 the huge overhead of having thousands of tiny files and directories


diff --git a/fusearchive.py b/fusearchive.py
index 4a860ee..faf465a 100755
--- a/fusearchive.py
+++ b/fusearchive.py
@@ -8,7 +8,7 @@
 #    See the file COPYING.
 #
 
-import os, sys, fcntl, fuse, sha, cPickle, gzip, errno
+import os, sys, fcntl, fuse, sha, cPickle, gzip, errno, zipfile
 from fuse import Fuse
 
 import pdb
@@ -42,9 +42,15 @@ def dmsg(level,message):
     if level <= debug_level:
         print str(level) + ": " + str(message)
 
+def save_chunk( chunk ):  # public entry point; picks the storage backend
+    return _save_chunk_zip( chunk )  # zip store (_save_chunk_fs is the per-file one)
+
+def load_chunk( key ):  # public entry point; picks the storage backend
+    return _load_chunk_zip( key )  # zip store (_load_chunk_fs is the per-file one)
+
 # This will write out a data block, it will return a key that can get this
 # data back later
-def save_chunk( chunk ):
+def _save_chunk_fs( chunk ):
     if magic_profiling:
         return( [ 0, 0 ] )
 
@@ -107,8 +113,68 @@ def save_chunk( chunk ):
     dmsg( 3, "Got chunk slot: " + str( sub ) )
     return( [ digest, sub ] )
 
+def _save_chunk_zip( chunk ):
+    """Store chunk in a zip under ./storage and return [ digest, slot ].
+
+    The archive is named after the first 4 hex digits of the chunk's SHA-1;
+    members are named <hexdigest>_<slot>, and slot is bumped past any member
+    that holds different data.  Returns [ 0, 0 ] when magic_profiling is set."""
+    if magic_profiling:
+        return( [ 0, 0 ] )
+
+    dmsg( 2, "Begin save_chunk, length: " + str( len( chunk ) ) )
+    if debug_level > 4:
+        dmsg( 5, "Chunk: " + str( chunk ) )
+
+    # Save this hash string, similar to the backuppc algo
+    digest = sha.new( chunk ).digest()
+    chars = list( digest )
+    dmsg( 4, chars )
+
+    # Hex form is built by hand; the load side rebuilds it the same way
+    hexdigest = ''.join( [ "%02x" % ord( x ) for x in chars ] )
+
+    # 4 hex digits of prefix, so at most 65536 zip files
+    zipname = hexdigest[ 0:4 ] + ".zip"
+    zippath = "./storage/" + zipname
+    dmsg( 3, "Zip name: " + zipname )
+    if not os.path.exists( zippath ):
+        dmsg( 3, "Creating intial empty zip" )
+        z = zipfile.ZipFile( zippath, 'w', zipfile.ZIP_DEFLATED, True )
+        try:
+            # Seed entry: append mode reportedly throws if the file is not
+            # a zip (or maybe only if it is zero-length)
+            z.writestr( 'junk', 'junk' )
+        finally:
+            z.close()
+
+    z = zipfile.ZipFile( zippath, 'a', zipfile.ZIP_DEFLATED, True )
+    try:
+        # Find a chunk slot
+        sub = 0
+        while True:
+            checkpath = hexdigest + "_" + str( sub )
+            dmsg( 3, "Checking: " + checkpath )
+            try:
+                data = z.read( checkpath )
+            except KeyError:
+                # read() raises KeyError for a missing member: slot is free
+                dmsg( 3, "No block here, creating new block" )
+                z.writestr( checkpath, chunk )
+                break
+            if data == chunk:
+                dmsg( 3, "Found existing block" )
+                break
+            dmsg( 3, "Block exists but is not the same" )
+            sub += 1
+    finally:
+        # Close even on error so the archive handle is not leaked
+        z.close()
+    dmsg( 3, "Got chunk slot: " + str( sub ) )
+    return( [ digest, sub ] )
+
 # This will return a data block by key that was saved previously
-def load_chunk( key ):
+def _load_chunk_fs( key ):
     if magic_profiling:
         return ''
 
@@ -142,6 +208,40 @@ def load_chunk( key ):
 
     return chunk
 
+def _load_chunk_zip( key ):
+    """Return the chunk stored under key, the [ digest, slot ] pair that the
+    save side returned (digest is the raw hash string).  Raises IOError if
+    the member is empty; a missing member raises KeyError from zipfile."""
+    if magic_profiling:
+        return ''
+
+    ( thash, seq ) = key
+    dmsg( 2, "Begin load_chunk" )
+
+    chars = list( thash )
+    dmsg( 4, chars )
+
+    # Todo: make a digest -> path function to share with deflate
+    hexdigest = ''.join( [ "%02x" % ord( x ) for x in chars ] )
+
+    zipname = hexdigest[ 0:4 ] + ".zip"
+    dmsg( 3, "Zip name: " + zipname )
+    z = zipfile.ZipFile( "./storage/" + zipname, 'r', zipfile.ZIP_DEFLATED, True )
+    try:
+        subpath = hexdigest + "_" + str( seq )
+        dmsg( 3, "Chunk path: " + subpath )
+        chunk = z.read( subpath )
+    finally:
+        # Always release the archive, including when read() raises
+        z.close()
+    if not len( chunk ):
+        raise IOError
+    dmsg( 3, "Exporting chunk" )
+    if debug_level > 4:
+        dmsg( 5, "Load-Chunk: " + str( chunk ) )
+    return chunk
+
+
 class FuseArchiveStream:
     """This just allows switching out writer classes easily"""
     @staticmethod
-- 
cgit v0.10.2