feat: use Zstd for index file compression instead of zlib
shenlebantongying committed Mar 24, 2024
1 parent 2cea76f commit 7a5a5d8
Showing 6 changed files with 90 additions and 27 deletions.
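
The commit swaps zlib's one-shot compress()/uncompress() calls for zstd's context-based one-shot API in the index reader and writer. A minimal sketch (not part of the commit) of that round trip, using only standard libzstd calls; the buffer names and the main() wrapper are illustrative:

```cpp
// Sketch: zstd one-shot compression/decompression with reusable contexts,
// the same call pattern used below for btree nodes and storage chunks.
#include <zstd.h>
#include <cstdio>
#include <string>
#include <vector>

int main()
{
  const std::string input = "payload that would be a btree node or a chunk";

  // Compress with an explicit, reusable context.
  ZSTD_CCtx * cctx = ZSTD_createCCtx();
  std::vector< char > compressed( ZSTD_compressBound( input.size() ) );
  const size_t csize = ZSTD_compress2( cctx, compressed.data(), compressed.size(),
                                       input.data(), input.size() );
  ZSTD_freeCCtx( cctx );
  if ( ZSTD_isError( csize ) ) {
    std::fprintf( stderr, "compress failed: %s\n", ZSTD_getErrorName( csize ) );
    return 1;
  }

  // Decompress into a buffer sized from the known uncompressed length,
  // then verify the produced size matches it (as readNode()/getBlock() do).
  ZSTD_DCtx * dctx = ZSTD_createDCtx();
  std::vector< char > output( input.size() );
  const size_t dsize = ZSTD_decompressDCtx( dctx, output.data(), output.size(),
                                            compressed.data(), csize );
  ZSTD_freeDCtx( dctx );
  if ( ZSTD_isError( dsize ) || dsize != output.size() ) {
    std::fprintf( stderr, "decompress failed\n" );
    return 1;
  }

  std::printf( "%zu -> %zu -> %zu bytes\n", input.size(), csize, dsize );
  return 0;
}
```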
1 change: 1 addition & 0 deletions CMake_Unix.cmake
@@ -40,6 +40,7 @@ pkg_check_modules(PKGCONFIG_DEPS IMPORTED_TARGET
vorbis # .ogg
vorbisfile
liblzma
libzstd
xapian-core
)

35 changes: 23 additions & 12 deletions src/btreeidx.cc
@@ -4,8 +4,7 @@
#include "btreeidx.hh"
#include "folding.hh"
#include "utf8.hh"
#include <QRunnable>
#include <QThreadPool>

#include <QSemaphore>
#include <math.h>
#include <string.h>
@@ -14,7 +13,6 @@
#include "wstring_qt.hh"
#include "utils.hh"

#include <QRegularExpression>
#include "wildcard.hh"
#include "globalbroadcaster.hh"

@@ -36,6 +34,7 @@ BtreeIndex::BtreeIndex():
idxFile( nullptr ),
rootNodeLoaded( false )
{
zstd_dctx.reset( ZSTD_createDCtx() );
}

BtreeDictionary::BtreeDictionary( string const & id, vector< string > const & dictionaryFiles ):
@@ -411,10 +410,15 @@ void BtreeIndex::readNode( uint32_t offset, vector< char > & out )

unsigned long decompressedLength = out.size();

if ( uncompress( (unsigned char *)&out.front(), &decompressedLength, &compressedData.front(), compressedData.size() )
!= Z_OK
|| decompressedLength != out.size() )
const size_t size_or_err = ZSTD_decompressDCtx( zstd_dctx.get(),
out.data(),
decompressedLength,
compressedData.data(),
compressedData.size() );

if ( ZSTD_isError( size_or_err ) || size_or_err != out.size() ) {
throw exFailedToDecompressNode();
}
}

char const * BtreeIndex::findChainOffsetExactOrPrefix(
@@ -758,6 +762,10 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
size_t maxElements,
uint32_t & lastLeafLinkOffset )
{

std::unique_ptr< ZSTD_CCtx, zstd_deleter > zstd_cctx;
zstd_cctx.reset( ZSTD_createCCtx() );

// We compress all the node data. This buffer would hold it.
vector< unsigned char > uncompressedData;

@@ -846,21 +854,24 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
}

// Save the result.
vector< unsigned char > compressedData( compressBound( uncompressedData.size() ) );
vector< unsigned char > compressedData( ZSTD_compressBound( uncompressedData.size() ) );

unsigned long compressedSize = compressedData.size();
const size_t size_or_err = ZSTD_compress2( zstd_cctx.get(),
compressedData.data(),
compressedData.size(),
uncompressedData.data(),
uncompressedData.size() );

if ( compress( &compressedData.front(), &compressedSize, &uncompressedData.front(), uncompressedData.size() )
!= Z_OK ) {
if ( ZSTD_isError( size_or_err ) ) {
qFatal( "Failed to compress btree node." );
abort();
}

uint32_t offset = file.tell();

file.write< uint32_t >( uncompressedData.size() );
file.write< uint32_t >( compressedSize );
file.write( &compressedData.front(), compressedSize );
file.write< uint32_t >( size_or_err );
file.write( &compressedData.front(), size_or_err );

if ( isLeaf ) {
// A link to the next leaf, which is zero and which will be updated
24 changes: 21 additions & 3 deletions src/btreeidx.hh
@@ -12,12 +12,13 @@
#include <stdint.h>
#include <string>
#include <vector>
#include <memory>

#include <QFuture>
#include <QList>
#include <QSet>
#include <QVector>

#include <zstd.h>


/// A base for the dictionary which creates a btree index to look up
/// the words.
@@ -28,11 +29,25 @@ using gd::wstring;
using std::vector;
using std::map;

struct zstd_deleter
{
void operator()( ZSTD_DCtx * Ctx ) const
{
ZSTD_freeDCtx( Ctx );
}

void operator()( ZSTD_CCtx * Ctx ) const
{
ZSTD_freeCCtx( Ctx );
}
};


enum {
/// This is to be bumped up each time the internal format changes.
/// The value isn't used here by itself, it is supposed to be added
/// to each dictionary's internal format version.
FormatVersion = 4
FormatVersion = 5
};

// These exceptions which might be thrown during the index traversal
@@ -139,6 +154,9 @@ protected:

protected:

std::unique_ptr< ZSTD_DCtx, zstd_deleter > zstd_dctx;

// The lifetime of the two variables below is not managed by this class.
QMutex * idxFileMutex;
File::Class * idxFile;

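
The zstd_deleter introduced above lets std::unique_ptr own the zstd contexts, so ZSTD_freeDCtx / ZSTD_freeCCtx run automatically when the index object is destroyed. A standalone sketch of that RAII pattern (the Codec holder is hypothetical, not a class from the repository):

```cpp
// Sketch: owning ZSTD contexts through std::unique_ptr with a custom deleter.
#include <zstd.h>
#include <memory>

struct zstd_deleter
{
  void operator()( ZSTD_CCtx * ctx ) const { ZSTD_freeCCtx( ctx ); }
  void operator()( ZSTD_DCtx * ctx ) const { ZSTD_freeDCtx( ctx ); }
};

class Codec // hypothetical holder, for illustration only
{
  std::unique_ptr< ZSTD_CCtx, zstd_deleter > cctx{ ZSTD_createCCtx() };
  std::unique_ptr< ZSTD_DCtx, zstd_deleter > dctx{ ZSTD_createDCtx() };

public:
  ZSTD_CCtx * compressionCtx() { return cctx.get(); }
  ZSTD_DCtx * decompressionCtx() { return dctx.get(); }
};
```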
32 changes: 22 additions & 10 deletions src/chunkedstorage.cc
@@ -2,10 +2,8 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */

#include "chunkedstorage.hh"
#include <zlib.h>
#include <zstd.h>
#include <string.h>
#include <QDataStream>
#include <QScopeGuard>
#include <QMutexLocker>

namespace ChunkedStorage {
@@ -19,6 +17,8 @@ Writer::Writer( File::Class & f ):
chunkStarted( false ),
bufferUsed( 0 )
{
zstd_cctx.reset( ZSTD_createCCtx() );

// Create a scratchpad at the beginning of the file. We use it to write the chunk
// table if it would fit, in order to save some seek times.

@@ -64,21 +64,26 @@ void Writer::addToBlock( void const * data, size_t size )

void Writer::saveCurrentChunk()
{
size_t maxCompressedSize = compressBound( bufferUsed );
size_t maxCompressedSize = ZSTD_compressBound( bufferUsed );

if ( bufferCompressed.size() < maxCompressedSize )
bufferCompressed.resize( maxCompressedSize );

unsigned long compressedSize = bufferCompressed.size();

if ( compress( &bufferCompressed.front(), &compressedSize, &buffer.front(), bufferUsed ) != Z_OK )
const size_t size_or_err = ZSTD_compress2( zstd_cctx.get(),
bufferCompressed.data(),
bufferCompressed.size(),
buffer.data(),
bufferUsed );
if ( ZSTD_isError( size_or_err ) ) {
throw exFailedToCompressChunk();
}

offsets.push_back( file.tell() );

file.write( (uint32_t)bufferUsed );
file.write( (uint32_t)compressedSize );
file.write( &bufferCompressed.front(), compressedSize );
file.write( (uint32_t)size_or_err );
file.write( &bufferCompressed.front(), size_or_err );

bufferUsed = 0;

@@ -118,6 +123,8 @@ uint32_t Writer::finish()
Reader::Reader( File::Class & f, uint32_t offset ):
file( f )
{
zstd_dctx.reset( ZSTD_createDCtx() );

file.seek( offset );

uint32_t size = file.read< uint32_t >();
@@ -165,8 +172,13 @@ char * Reader::getBlock( uint32_t address, vector< char > & chunk )

unsigned long decompressedLength = chunk.size();

if ( uncompress( (unsigned char *)&chunk.front(), &decompressedLength, chunkDataBytes, compressedSize ) != Z_OK
|| decompressedLength != chunk.size() ) {
size_t const size_or_err = ZSTD_decompressDCtx( zstd_dctx.get(),
chunk.data(),
decompressedLength,
chunkDataBytes,
compressedSize );

if ( ZSTD_isError( size_or_err ) || size_or_err != chunk.size() ) {
throw exFailedToDecompressChunk();
}
}
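
Writer::saveCurrentChunk() and Reader::getBlock() keep the existing framing — a uint32 uncompressed size, a uint32 compressed size, then the compressed bytes — so only the codec changes; the stored uncompressed size lets the reader allocate the output buffer up front and cross-check the decompressed length. A simplified sketch of that framing over an in-memory buffer (the helper names and the fixed compression level are illustrative; the real code writes through File::Class with a ZSTD_CCtx):

```cpp
// Sketch: the [uncompressed size][compressed size][payload] chunk framing,
// written to and read back from an in-memory byte buffer.
#include <zstd.h>
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <vector>

static void writeChunk( std::vector< unsigned char > & out, const void * data, size_t size )
{
  std::vector< unsigned char > compressed( ZSTD_compressBound( size ) );
  const size_t csize = ZSTD_compress( compressed.data(), compressed.size(), data, size, 3 );
  if ( ZSTD_isError( csize ) )
    throw std::runtime_error( "failed to compress chunk" );

  const uint32_t header[ 2 ] = { (uint32_t)size, (uint32_t)csize };
  out.insert( out.end(), (unsigned char const *)header, (unsigned char const *)header + sizeof header );
  out.insert( out.end(), compressed.data(), compressed.data() + csize );
}

static std::vector< unsigned char > readChunk( std::vector< unsigned char > const & in )
{
  uint32_t header[ 2 ];
  std::memcpy( header, in.data(), sizeof header );

  std::vector< unsigned char > chunk( header[ 0 ] );
  const size_t dsize = ZSTD_decompress( chunk.data(), chunk.size(),
                                        in.data() + sizeof header, header[ 1 ] );
  if ( ZSTD_isError( dsize ) || dsize != chunk.size() )
    throw std::runtime_error( "failed to decompress chunk" );
  return chunk;
}
```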
22 changes: 21 additions & 1 deletion src/chunkedstorage.hh
@@ -8,7 +8,9 @@
#include "file.hh"

#include <vector>
#include <stdint.h>
#include <memory>

#include <zstd.h>

/// A chunked compression storage. We use this for articles' bodies. The idea
/// is to store data in separately-compressed chunks, much like in dictzip,
@@ -27,6 +29,19 @@ DEF_EX( exAddressOutOfRange, "The given chunked address is out of range", Ex )
DEF_EX( exFailedToDecompressChunk, "Failed to decompress a chunk", Ex )
DEF_EX( mapFailed, "Failed to map/unmap the file", Ex )

struct zstd_deleter
{
void operator()( ZSTD_DCtx * Ctx ) const
{
ZSTD_freeDCtx( Ctx );
}

void operator()( ZSTD_CCtx * Ctx ) const
{
ZSTD_freeCCtx( Ctx );
}
};

/// This class writes data blocks in chunks.
class Writer
{
@@ -66,6 +81,8 @@ private:
size_t bufferUsed;

void saveCurrentChunk();

std::unique_ptr<ZSTD_CCtx,zstd_deleter> zstd_cctx;
};

/// This class reads data blocks previously written by Writer.
@@ -83,6 +100,9 @@ public:
/// Uses the user-provided storage to load the entire chunk, and then to
/// return a pointer to the requested block inside it.
char * getBlock( uint32_t address, vector< char > & );

private:
std::unique_ptr<ZSTD_DCtx,zstd_deleter> zstd_dctx;
};

} // namespace ChunkedStorage
3 changes: 2 additions & 1 deletion src/dict/mdx.cc
@@ -1437,7 +1437,8 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f

auto index_finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> c = index_finish - index_begin;
qDebug() << c;
qDebug() << c.count() << "ms";
// or simply qDebug() << c; on Qt >= 6.6

t = t - 1;
if ( t > 0 ) {
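
The logging tweak in mdx.cc is needed because qDebug() only learned to stream std::chrono::duration in Qt 6.6; on older Qt the duration's count() has to be printed instead. A small sketch of the portable form (the function name and the elided work are illustrative):

```cpp
// Sketch: timing the index build and logging it across Qt versions.
#include <QDebug>
#include <chrono>

void timeIndexBuild()
{
  const auto begin = std::chrono::high_resolution_clock::now();
  // ... build the dictionary index here ...
  const auto finish = std::chrono::high_resolution_clock::now();

  const std::chrono::duration< double, std::milli > elapsed = finish - begin;
  qDebug() << elapsed.count() << "ms"; // Qt >= 6.6 can stream `elapsed` directly.
}
```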
