From 7a5a5d8514ce388fba208c3758220dc2c555a236 Mon Sep 17 00:00:00 2001 From: shenleban tongying Date: Sun, 24 Mar 2024 16:01:34 -0400 Subject: [PATCH] feat: use Zstd for index file compression instead of zlib --- CMake_Unix.cmake | 1 + src/btreeidx.cc | 35 +++++++++++++++++++++++------------ src/btreeidx.hh | 24 +++++++++++++++++++++--- src/chunkedstorage.cc | 32 ++++++++++++++++++++++---------- src/chunkedstorage.hh | 22 +++++++++++++++++++++- src/dict/mdx.cc | 3 ++- 6 files changed, 90 insertions(+), 27 deletions(-) diff --git a/CMake_Unix.cmake b/CMake_Unix.cmake index db066f1980..bce28da987 100644 --- a/CMake_Unix.cmake +++ b/CMake_Unix.cmake @@ -40,6 +40,7 @@ pkg_check_modules(PKGCONFIG_DEPS IMPORTED_TARGET vorbis # .ogg vorbisfile liblzma + libzstd xapian-core ) diff --git a/src/btreeidx.cc b/src/btreeidx.cc index 22132dc3d5..5e7e27f2a6 100644 --- a/src/btreeidx.cc +++ b/src/btreeidx.cc @@ -4,8 +4,7 @@ #include "btreeidx.hh" #include "folding.hh" #include "utf8.hh" -#include -#include + #include #include #include @@ -14,7 +13,6 @@ #include "wstring_qt.hh" #include "utils.hh" -#include #include "wildcard.hh" #include "globalbroadcaster.hh" @@ -36,6 +34,7 @@ BtreeIndex::BtreeIndex(): idxFile( nullptr ), rootNodeLoaded( false ) { + zstd_dctx.reset( ZSTD_createDCtx() ); } BtreeDictionary::BtreeDictionary( string const & id, vector< string > const & dictionaryFiles ): @@ -411,10 +410,15 @@ void BtreeIndex::readNode( uint32_t offset, vector< char > & out ) unsigned long decompressedLength = out.size(); - if ( uncompress( (unsigned char *)&out.front(), &decompressedLength, &compressedData.front(), compressedData.size() ) - != Z_OK - || decompressedLength != out.size() ) + const size_t size_or_err = ZSTD_decompressDCtx( zstd_dctx.get(), + out.data(), + decompressedLength, + compressedData.data(), + compressedData.size() ); + + if ( ZSTD_isError( size_or_err ) || size_or_err != out.size() ) { throw exFailedToDecompressNode(); + } } char const * BtreeIndex::findChainOffsetExactOrPrefix( @@ -758,6 +762,10 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex, size_t maxElements, uint32_t & lastLeafLinkOffset ) { + + std::unique_ptr< ZSTD_CCtx, zstd_deleter > zstd_cctx; + zstd_cctx.reset( ZSTD_createCCtx() ); + // We compress all the node data. This buffer would hold it. vector< unsigned char > uncompressedData; @@ -846,12 +854,15 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex, } // Save the result. - vector< unsigned char > compressedData( compressBound( uncompressedData.size() ) ); + vector< unsigned char > compressedData( ZSTD_compressBound( uncompressedData.size() ) ); - unsigned long compressedSize = compressedData.size(); + const size_t size_or_err = ZSTD_compress2( zstd_cctx.get(), + compressedData.data(), + compressedData.size(), + uncompressedData.data(), + uncompressedData.size() ); - if ( compress( &compressedData.front(), &compressedSize, &uncompressedData.front(), uncompressedData.size() ) - != Z_OK ) { + if ( ZSTD_isError( size_or_err ) ) { qFatal( "Failed to compress btree node." ); abort(); } @@ -859,8 +870,8 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex, uint32_t offset = file.tell(); file.write< uint32_t >( uncompressedData.size() ); - file.write< uint32_t >( compressedSize ); - file.write( &compressedData.front(), compressedSize ); + file.write< uint32_t >( size_or_err ); + file.write( &compressedData.front(), size_or_err ); if ( isLeaf ) { // A link to the next leef, which is zero and which will be updated diff --git a/src/btreeidx.hh b/src/btreeidx.hh index 77c905460f..15c5e47d3d 100644 --- a/src/btreeidx.hh +++ b/src/btreeidx.hh @@ -12,12 +12,13 @@ #include #include #include +#include #include -#include -#include #include +#include + /// A base for the dictionary which creates a btree index to look up /// the words. @@ -28,11 +29,25 @@ using gd::wstring; using std::vector; using std::map; +struct zstd_deleter +{ + void operator()( ZSTD_DCtx * Ctx ) const + { + ZSTD_freeDCtx( Ctx ); + } + + void operator()( ZSTD_CCtx * Ctx ) const + { + ZSTD_freeCCtx( Ctx ); + } +}; + + enum { /// This is to be bumped up each time the internal format changes. /// The value isn't used here by itself, it is supposed to be added /// to each dictionary's internal format version. - FormatVersion = 4 + FormatVersion = 5 }; // These exceptions which might be thrown during the index traversal @@ -139,6 +154,9 @@ protected: protected: + std::unique_ptr< ZSTD_DCtx, zstd_deleter > zstd_dctx; + + // Lifetime of 2 var below is not managed by this class. QMutex * idxFileMutex; File::Class * idxFile; diff --git a/src/chunkedstorage.cc b/src/chunkedstorage.cc index 9a33e7a4a7..1087a90677 100644 --- a/src/chunkedstorage.cc +++ b/src/chunkedstorage.cc @@ -2,10 +2,8 @@ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #include "chunkedstorage.hh" -#include +#include #include -#include -#include #include namespace ChunkedStorage { @@ -19,6 +17,8 @@ Writer::Writer( File::Class & f ): chunkStarted( false ), bufferUsed( 0 ) { + zstd_cctx.reset( ZSTD_createCCtx() ); + // Create a sratchpad at the beginning of file. We use it to write chunk // table if it would fit, in order to save some seek times. @@ -64,21 +64,26 @@ void Writer::addToBlock( void const * data, size_t size ) void Writer::saveCurrentChunk() { - size_t maxCompressedSize = compressBound( bufferUsed ); + size_t maxCompressedSize = ZSTD_compressBound( bufferUsed ); if ( bufferCompressed.size() < maxCompressedSize ) bufferCompressed.resize( maxCompressedSize ); - unsigned long compressedSize = bufferCompressed.size(); - if ( compress( &bufferCompressed.front(), &compressedSize, &buffer.front(), bufferUsed ) != Z_OK ) + const size_t size_or_err = ZSTD_compress2( zstd_cctx.get(), + bufferCompressed.data(), + bufferCompressed.size(), + buffer.data(), + bufferUsed ); + if ( ZSTD_isError( size_or_err ) ) { throw exFailedToCompressChunk(); + } offsets.push_back( file.tell() ); file.write( (uint32_t)bufferUsed ); - file.write( (uint32_t)compressedSize ); - file.write( &bufferCompressed.front(), compressedSize ); + file.write( (uint32_t)size_or_err ); + file.write( &bufferCompressed.front(), size_or_err ); bufferUsed = 0; @@ -118,6 +123,8 @@ uint32_t Writer::finish() Reader::Reader( File::Class & f, uint32_t offset ): file( f ) { + zstd_dctx.reset( ZSTD_createDCtx() ); + file.seek( offset ); uint32_t size = file.read< uint32_t >(); @@ -165,8 +172,13 @@ char * Reader::getBlock( uint32_t address, vector< char > & chunk ) unsigned long decompressedLength = chunk.size(); - if ( uncompress( (unsigned char *)&chunk.front(), &decompressedLength, chunkDataBytes, compressedSize ) != Z_OK - || decompressedLength != chunk.size() ) { + size_t const size_or_err = ZSTD_decompressDCtx( zstd_dctx.get(), + chunk.data(), + decompressedLength, + chunkDataBytes, + compressedSize ); + + if ( ZSTD_isError( size_or_err ) || size_or_err != chunk.size() ) { throw exFailedToDecompressChunk(); } } diff --git a/src/chunkedstorage.hh b/src/chunkedstorage.hh index 76e958ed5b..baae7b04d3 100644 --- a/src/chunkedstorage.hh +++ b/src/chunkedstorage.hh @@ -8,7 +8,9 @@ #include "file.hh" #include -#include +#include + +#include /// A chunked compression storage. We use this for articles' bodies. The idea /// is to store data in a separately-compressed chunks, much like in dictzip, @@ -27,6 +29,19 @@ DEF_EX( exAddressOutOfRange, "The given chunked address is out of range", Ex ) DEF_EX( exFailedToDecompressChunk, "Failed to decompress a chunk", Ex ) DEF_EX( mapFailed, "Failed to map/unmap the file", Ex ) +struct zstd_deleter +{ + void operator()( ZSTD_DCtx * Ctx ) const + { + ZSTD_freeDCtx( Ctx ); + } + + void operator()( ZSTD_CCtx * Ctx ) const + { + ZSTD_freeCCtx( Ctx ); + } +}; + /// This class writes data blocks in chunks. class Writer { @@ -66,6 +81,8 @@ private: size_t bufferUsed; void saveCurrentChunk(); + + std::unique_ptr zstd_cctx; }; /// This class reads data blocks previously written by Writer. @@ -83,6 +100,9 @@ public: /// Uses the user-provided storage to load the entire chunk, and then to /// return a pointer to the requested block inside it. char * getBlock( uint32_t address, vector< char > & ); + +private: + std::unique_ptr zstd_dctx; }; } // namespace ChunkedStorage diff --git a/src/dict/mdx.cc b/src/dict/mdx.cc index a4834cf166..bdc9ea74f2 100644 --- a/src/dict/mdx.cc +++ b/src/dict/mdx.cc @@ -1437,7 +1437,8 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f auto index_finish = std::chrono::high_resolution_clock::now(); std::chrono::duration c = index_finish - index_begin; - qDebug() << c; + qDebug() << c.count()<<"ms"; + //or qDebug() << c; for qt6.6 t = t - 1; if ( t > 0 ) {