diff --git a/CMake_Unix.cmake b/CMake_Unix.cmake index db066f198..bce28da98 100644 --- a/CMake_Unix.cmake +++ b/CMake_Unix.cmake @@ -40,6 +40,7 @@ pkg_check_modules(PKGCONFIG_DEPS IMPORTED_TARGET vorbis # .ogg vorbisfile liblzma + libzstd xapian-core ) diff --git a/goldendict.pro b/goldendict.pro index c45fc1572..27e78d2fa 100644 --- a/goldendict.pro +++ b/goldendict.pro @@ -117,7 +117,8 @@ UI_DIR = build MOC_DIR = build RCC_DIR = build LIBS += -lbz2 \ - -llzo2 + -llzo2 \ + -lzstd win32{ Debug: LIBS+= -L$$PWD/winlibs/lib/dbg/ -lzlibd diff --git a/src/btreeidx.cc b/src/btreeidx.cc index 22132dc3d..8a03f19cb 100644 --- a/src/btreeidx.cc +++ b/src/btreeidx.cc @@ -4,8 +4,7 @@ #include "btreeidx.hh" #include "folding.hh" #include "utf8.hh" -#include -#include + #include #include #include @@ -14,7 +13,6 @@ #include "wstring_qt.hh" #include "utils.hh" -#include #include "wildcard.hh" #include "globalbroadcaster.hh" @@ -36,6 +34,7 @@ BtreeIndex::BtreeIndex(): idxFile( nullptr ), rootNodeLoaded( false ) { + zstd_dctx.reset( ZSTD_createDCtx() ); } BtreeDictionary::BtreeDictionary( string const & id, vector< string > const & dictionaryFiles ): @@ -409,12 +408,12 @@ void BtreeIndex::readNode( uint32_t offset, vector< char > & out ) idxFile->read( &compressedData.front(), compressedData.size() ); - unsigned long decompressedLength = out.size(); + const size_t size_or_err = + ZSTD_decompressDCtx( zstd_dctx.get(), out.data(), out.size(), compressedData.data(), compressedData.size() ); - if ( uncompress( (unsigned char *)&out.front(), &decompressedLength, &compressedData.front(), compressedData.size() ) - != Z_OK - || decompressedLength != out.size() ) + if ( ZSTD_isError( size_or_err ) || size_or_err != out.size() ) { throw exFailedToDecompressNode(); + } } char const * BtreeIndex::findChainOffsetExactOrPrefix( @@ -758,6 +757,10 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex, size_t maxElements, uint32_t & lastLeafLinkOffset ) { + + std::unique_ptr< ZSTD_CCtx, ZSTD::deleter > zstd_cctx; + zstd_cctx.reset( ZSTD_createCCtx() ); + // We compress all the node data. This buffer would hold it. vector< unsigned char > uncompressedData; @@ -846,12 +849,15 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex, } // Save the result. - vector< unsigned char > compressedData( compressBound( uncompressedData.size() ) ); + vector< unsigned char > compressedData( ZSTD_compressBound( uncompressedData.size() ) ); - unsigned long compressedSize = compressedData.size(); + const size_t size_or_err = ZSTD_compress2( zstd_cctx.get(), + compressedData.data(), + compressedData.size(), + uncompressedData.data(), + uncompressedData.size() ); - if ( compress( &compressedData.front(), &compressedSize, &uncompressedData.front(), uncompressedData.size() ) - != Z_OK ) { + if ( ZSTD_isError( size_or_err ) ) { qFatal( "Failed to compress btree node." ); abort(); } @@ -859,8 +865,8 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex, uint32_t offset = file.tell(); file.write< uint32_t >( uncompressedData.size() ); - file.write< uint32_t >( compressedSize ); - file.write( &compressedData.front(), compressedSize ); + file.write< uint32_t >( size_or_err ); + file.write( &compressedData.front(), size_or_err ); if ( isLeaf ) { // A link to the next leef, which is zero and which will be updated diff --git a/src/btreeidx.hh b/src/btreeidx.hh index 77c905460..23245e080 100644 --- a/src/btreeidx.hh +++ b/src/btreeidx.hh @@ -12,12 +12,13 @@ #include #include #include +#include #include -#include -#include #include +#include "zstd_wrapper.hh" + /// A base for the dictionary which creates a btree index to look up /// the words. @@ -28,11 +29,12 @@ using gd::wstring; using std::vector; using std::map; + enum { /// This is to be bumped up each time the internal format changes. /// The value isn't used here by itself, it is supposed to be added /// to each dictionary's internal format version. - FormatVersion = 4 + FormatVersion = 5 }; // These exceptions which might be thrown during the index traversal @@ -139,6 +141,9 @@ protected: protected: + std::unique_ptr< ZSTD_DCtx, ZSTD::deleter > zstd_dctx; + + // Lifetime of 2 var below is not managed by this class. QMutex * idxFileMutex; File::Class * idxFile; diff --git a/src/chunkedstorage.cc b/src/chunkedstorage.cc index 9a33e7a4a..7d0ab0558 100644 --- a/src/chunkedstorage.cc +++ b/src/chunkedstorage.cc @@ -2,11 +2,12 @@ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #include "chunkedstorage.hh" -#include +#include #include + #include -#include #include +#include namespace ChunkedStorage { @@ -19,6 +20,8 @@ Writer::Writer( File::Class & f ): chunkStarted( false ), bufferUsed( 0 ) { + zstd_cctx.reset( ZSTD_createCCtx() ); + // Create a sratchpad at the beginning of file. We use it to write chunk // table if it would fit, in order to save some seek times. @@ -64,21 +67,22 @@ void Writer::addToBlock( void const * data, size_t size ) void Writer::saveCurrentChunk() { - size_t maxCompressedSize = compressBound( bufferUsed ); - if ( bufferCompressed.size() < maxCompressedSize ) + if ( size_t maxCompressedSize = ZSTD_compressBound( bufferUsed ); bufferCompressed.size() < maxCompressedSize ) bufferCompressed.resize( maxCompressedSize ); - unsigned long compressedSize = bufferCompressed.size(); + const size_t size_or_err = + ZSTD_compress2( zstd_cctx.get(), bufferCompressed.data(), bufferCompressed.size(), buffer.data(), bufferUsed ); - if ( compress( &bufferCompressed.front(), &compressedSize, &buffer.front(), bufferUsed ) != Z_OK ) + if ( ZSTD_isError( size_or_err ) ) { throw exFailedToCompressChunk(); + } offsets.push_back( file.tell() ); file.write( (uint32_t)bufferUsed ); - file.write( (uint32_t)compressedSize ); - file.write( &bufferCompressed.front(), compressedSize ); + file.write( (uint32_t)size_or_err ); + file.write( bufferCompressed.data(), size_or_err ); bufferUsed = 0; @@ -118,6 +122,8 @@ uint32_t Writer::finish() Reader::Reader( File::Class & f, uint32_t offset ): file( f ) { + zstd_dctx.reset( ZSTD_createDCtx() ); + file.seek( offset ); uint32_t size = file.read< uint32_t >(); @@ -163,10 +169,11 @@ char * Reader::getBlock( uint32_t address, vector< char > & chunk ) } ); Q_UNUSED( autoUnmap ) - unsigned long decompressedLength = chunk.size(); - if ( uncompress( (unsigned char *)&chunk.front(), &decompressedLength, chunkDataBytes, compressedSize ) != Z_OK - || decompressedLength != chunk.size() ) { + size_t const size_or_err = + ZSTD_decompressDCtx( zstd_dctx.get(), chunk.data(), chunk.size(), chunkDataBytes, compressedSize ); + + if ( ZSTD_isError( size_or_err ) || size_or_err != chunk.size() ) { throw exFailedToDecompressChunk(); } } diff --git a/src/chunkedstorage.hh b/src/chunkedstorage.hh index 76e958ed5..e856ec5b3 100644 --- a/src/chunkedstorage.hh +++ b/src/chunkedstorage.hh @@ -8,7 +8,9 @@ #include "file.hh" #include -#include +#include + +#include "zstd_wrapper.hh" /// A chunked compression storage. We use this for articles' bodies. The idea /// is to store data in a separately-compressed chunks, much like in dictzip, @@ -66,6 +68,8 @@ private: size_t bufferUsed; void saveCurrentChunk(); + + std::unique_ptr< ZSTD_CCtx, ZSTD::deleter > zstd_cctx; }; /// This class reads data blocks previously written by Writer. @@ -83,6 +87,9 @@ public: /// Uses the user-provided storage to load the entire chunk, and then to /// return a pointer to the requested block inside it. char * getBlock( uint32_t address, vector< char > & ); + +private: + std::unique_ptr< ZSTD_DCtx, ZSTD::deleter > zstd_dctx; }; } // namespace ChunkedStorage diff --git a/src/dict/mdx.cc b/src/dict/mdx.cc index ba5a50766..d410c599b 100644 --- a/src/dict/mdx.cc +++ b/src/dict/mdx.cc @@ -1270,11 +1270,13 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f string dictId = Dictionary::makeDictionaryId( dictFiles ); string indexFile = indicesDir + dictId; - - if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || indexIsOldOrBad( dictFiles, indexFile ) ) { + int t = 20; + mdxbench: + if ( true ) { // Building the index - gdDebug( "MDict: Building the index for dictionary: %s\n", fileName.c_str() ); + // gdDebug( "MDict: Building the index for dictionary: %s\n", fileName.c_str() ); + auto index_begin = std::chrono::high_resolution_clock::now(); MdictParser parser; list< sptr< MdictParser > > mddParsers; @@ -1432,6 +1434,16 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f idx.rewind(); idx.write( &idxHeader, sizeof( idxHeader ) ); + + auto index_finish = std::chrono::high_resolution_clock::now(); + std::chrono::duration< double, std::milli > c = index_finish - index_begin; + qDebug() << c.count() << "ms"; + //or qDebug() << c; for qt6.6 + + t = t - 1; + if ( t > 0 ) { + goto mdxbench; + } } dictionaries.push_back( std::make_shared< MdxDictionary >( dictId, indexFile, dictFiles ) ); diff --git a/src/zstd_wrapper.hh b/src/zstd_wrapper.hh new file mode 100644 index 000000000..3b0cc521e --- /dev/null +++ b/src/zstd_wrapper.hh @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace ZSTD { + +struct deleter +{ + void operator()( ZSTD_DCtx * Ctx ) const + { + ZSTD_freeDCtx( Ctx ); + } + + void operator()( ZSTD_CCtx * Ctx ) const + { + ZSTD_freeCCtx( Ctx ); + } +}; + +} // namespace ZSTD