Friday, July 19, 2013

Streaming a Zip64 archive

One of my projects allowed the user to request a ZIP archive of files off the server, instead of downloading them individually. To avoid requiring (potentially) double the disk space, I wanted to stream the archive as I created it. Additionally, the total size of the uncompressed data could easily exceed 2^32 bytes, so I needed to use the Zip64 extensions.

In addition to providing the compression primitives, the excellent zlib package ships with a contributed ZIP generator, minizip. Unfortunately, this generator uses seek to move back and forth within the generated file, so it wasn't suitable for my needs.

I eventually wrote my own utility to do this streaming. It took quite a few tries, as the spec is confusing and ambiguous. Once I figured out the spec, I wrote the code to always assume the worst (file size, number of files, etc.); this costs a few extra bytes, but saves tons of logic.

The most interesting bits, at least regarding the ambiguous wording of the specification, are when we add a single file to the archive in Zip64Streamer::addFile:

bool
Zip64Streamer::addFile( const string & file )
{
    DEBUG( "af: adding file " << QS( file ) );

    FileInfo fi;
    fi.path = m_sDir + "/" + file;
    fi.name = file;
    fi.offset = m_offset;

    fillDateTime( fi );

    CharBuffer z64;
    write2( z64, Z64_EXTRA_FIELD_TAG );
    write2( z64, LENGTH_PLACEHOLDER );
    write8( z64, DEFER_UNCOMPRESSED_SIZE );
    write8( z64, DEFER_COMPRESSED_SIZE );
    fixupExtraFieldLength( z64 );

    CharBuffer unix;
    write2( unix, UNIX_EXTRA_FIELD_TAG );
    write2( unix, LENGTH_PLACEHOLDER );
    write4( unix, fi.stat_atime );
    write4( unix, fi.stat_mtime );
    write2( unix, UNIX_ZIP_UID );
    write2( unix, UNIX_ZIP_GID );
    fixupExtraFieldLength( unix );

    CharBuffer lh; // local header
    write4( lh, LOCAL_FILE_HEADER_SIG );
    write2( lh, VERSION_NEEDED_TO_EXTRACT_4_5 );
    write2( lh, GPB_DATA_DESC_FOLLOWS_DATA );
    write2( lh, COMPRESSION_METHOD_DEFLATE );
    write2( lh, fi.msdos_time );
    write2( lh, fi.msdos_date );
    write4( lh, DEFER_CRC32 );
    write4( lh, FORCE_Z64_COMPRESSED_SIZE );
    write4( lh, FORCE_Z64_UNCOMPRESSED_SIZE );
    write2( lh, static_cast< uint16_t >( fi.name.size() ) );
    write2( lh, static_cast< uint16_t >( z64.size() + unix.size() ) );

    FINE( "af: " << file << ": writing header" );

    emit( lh );
    emitCopy( fi.name );
    emit( z64 );
    emit( unix );

    emitCompressedData( fi );

    FINE( "af: " << file << ": writing descriptor" );
    CharBuffer dd; // data descriptor
    write4( dd, DATA_DESC_SIG );
    write4( dd, fi.crc32 );
    write8( dd, fi.compressed );
    write8( dd, fi.uncompressed );
    emit( dd );

    // save info for eventual use in central directory
    m_fileInfo.push_back( fi );

    return true;
}

After we've added all the files, we finalize the archive in the destructor:

// Finish the archive.
//
// Emits one central directory record per file saved in m_fileInfo,
// then the Zip64 end-of-central-directory record, the Zip64
// end-of-central-directory locator, and the classic
// end-of-central-directory record, and finally calls deflateEnd on m_zs.
//
// NOTE(review): all output happens inside a destructor.  If emit() can
// throw, the exception would escape a destructor -- confirm how emit()
// reports failures.
Zip64Streamer::~Zip64Streamer()
{
    DEBUG( "dtor: finishing zip file" );

    // save start of central directory
    const uint64_t centralDirOffset( m_offset );

    // emit a central directory record for each file
    for ( const FileInfo & fi : m_fileInfo )
    {
        FINE( "dtor: adding central dir record for " << QS( fi.name ) );

        // Zip64 extra field: here the real sizes are known, and the
        // local header offset is included as well.
        CharBuffer z64Extra;
        write2( z64Extra, Z64_EXTRA_FIELD_TAG );
        write2( z64Extra, LENGTH_PLACEHOLDER );
        write8( z64Extra, fi.uncompressed );
        write8( z64Extra, fi.compressed );
        write8( z64Extra, fi.offset );
        fixupExtraFieldLength( z64Extra );

        // Unix extra field.  Deliberately not named `unix`: GCC
        // predefines `unix` as a macro in its GNU dialects on Unix-like
        // targets, which would turn the declaration into a syntax error.
        CharBuffer unixExtra;
        write2( unixExtra, UNIX_EXTRA_FIELD_TAG );
        write2( unixExtra, LENGTH_PLACEHOLDER );
        write4( unixExtra, fi.stat_atime );
        write4( unixExtra, fi.stat_mtime );
        write2( unixExtra, UNIX_ZIP_UID );
        write2( unixExtra, UNIX_ZIP_GID );
        fixupExtraFieldLength( unixExtra );

        CharBuffer cd; // central directory file header
        write4( cd, CDIR_FILE_HEADER_SIG );
        write2( cd, VERSION_CREATED_BY_4_5_UNIX );
        write2( cd, VERSION_NEEDED_TO_EXTRACT_4_5 );
        write2( cd, GPB_DATA_DESC_FOLLOWS_DATA );
        write2( cd, COMPRESSION_METHOD_DEFLATE );
        write2( cd, fi.msdos_time );
        write2( cd, fi.msdos_date );
        write4( cd, fi.crc32 );
        write4( cd, FORCE_Z64_COMPRESSED_SIZE );
        write4( cd, FORCE_Z64_UNCOMPRESSED_SIZE );
        write2( cd, static_cast< uint16_t >( fi.name.size() ) );
        write2( cd, static_cast< uint16_t >( z64Extra.size() + unixExtra.size() ) );
        write2( cd, ZERO_COMMENT_LENGTH );
        write2( cd, DISK_START_ZERO );
        write2( cd, ZERO_INTERNAL_FILE_ATTR );
        write4( cd, UNIX_EXTERNAL_FILE_ATTR );
        write4( cd, FORCE_Z64_OFFSET );

        emit( cd );
        emitCopy( fi.name );
        emit( z64Extra );
        emit( unixExtra );
    }

    // how many bytes did that use?
    const uint64_t centralDirBytes( m_offset - centralDirOffset );

    // and where are we now?  (this is where the z64 end record starts,
    // which the locator below points back to)
    const uint64_t z64EndOfCentralDirLoc( m_offset );

    FINE( "dtor: central dir: "
          "bytes=" << centralDirBytes << ", "
          "offset=" << centralDirOffset );

    DEBUG( "dtor: adding z64 end of central directory record @ " << m_offset );
    CharBuffer z64End;
    write4( z64End, Z64_END_OF_CENTRAL_DIR_REC_SIG );
    write8( z64End, LENGTH_PLACEHOLDER );
    write2( z64End, VERSION_CREATED_BY_4_5_UNIX );
    write2( z64End, VERSION_NEEDED_TO_EXTRACT_4_5 );
    write4( z64End, DISK_NUMBER_ZERO );
    write4( z64End, DISK_START_ZERO );
    write8( z64End, m_fileInfo.size() ); // # dir entries on this disk
    write8( z64End, m_fileInfo.size() ); // # dir entries total
    write8( z64End, centralDirBytes );
    write8( z64End, centralDirOffset );
    fixupRecordLength64( z64End );
    emit( z64End );

    DEBUG( "dtor: adding z64 end of central directory locator @ " << m_offset );
    CharBuffer loc;
    write4( loc, Z64_END_OF_CENTRAL_DIR_LOC_SIG );
    write4( loc, DISK_NUMBER_ZERO );
    write8( loc, z64EndOfCentralDirLoc );
    write4( loc, DISK_TOTAL_ONE );
    emit( loc );

    // Classic end record: the FORCE_Z64_* values are written in place of
    // the real counts, size and offset, which live in the z64 end record.
    DEBUG( "dtor: adding end of central directory record @ " << m_offset);
    CharBuffer end;
    write4( end, END_OF_CENTRAL_DIR_SIG );
    write2( end, DISK_NUMBER_ZERO );
    write2( end, DISK_START_ZERO );
    write2( end, FORCE_Z64_LOCAL_ENTRIES );
    write2( end, FORCE_Z64_TOTAL_ENTRIES );
    write4( end, FORCE_Z64_CDIR_SIZE );
    write4( end, FORCE_Z64_CDIR_OFFSET );
    write2( end, ZERO_COMMENT_LENGTH );
    emit( end );

    DEBUG( "dtor: finalizing zlib" );
    deflateEnd( &m_zs );

    DEBUG( "dtor: done" );
}

The full implementation can be found on github at https://github.com/tkil/ajf-prog/tree/master/Zip64Streamer. Hopefully others will find it useful.

Happy hacking!