Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

experiment: switch from zlib to Zstd for index file #1438

Draft
wants to merge 3 commits into
base: staged
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMake_Unix.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ pkg_check_modules(PKGCONFIG_DEPS IMPORTED_TARGET
vorbis # .ogg
vorbisfile
liblzma
libzstd
xapian-core
)

Expand Down
3 changes: 2 additions & 1 deletion goldendict.pro
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ UI_DIR = build
MOC_DIR = build
RCC_DIR = build
LIBS += -lbz2 \
-llzo2
-llzo2 \
-lzstd

win32{
Debug: LIBS+= -L$$PWD/winlibs/lib/dbg/ -lzlibd
Expand Down
32 changes: 19 additions & 13 deletions src/btreeidx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
#include "btreeidx.hh"
#include "folding.hh"
#include "utf8.hh"
#include <QRunnable>
#include <QThreadPool>

#include <QSemaphore>
#include <math.h>
#include <string.h>
Expand All @@ -14,7 +13,6 @@
#include "wstring_qt.hh"
#include "utils.hh"

#include <QRegularExpression>
#include "wildcard.hh"
#include "globalbroadcaster.hh"

Expand All @@ -36,6 +34,7 @@ BtreeIndex::BtreeIndex():
idxFile( nullptr ),
rootNodeLoaded( false )
{
zstd_dctx.reset( ZSTD_createDCtx() );
}

BtreeDictionary::BtreeDictionary( string const & id, vector< string > const & dictionaryFiles ):
Expand Down Expand Up @@ -409,12 +408,12 @@ void BtreeIndex::readNode( uint32_t offset, vector< char > & out )

idxFile->read( &compressedData.front(), compressedData.size() );

unsigned long decompressedLength = out.size();
const size_t size_or_err =
ZSTD_decompressDCtx( zstd_dctx.get(), out.data(), out.size(), compressedData.data(), compressedData.size() );

if ( uncompress( (unsigned char *)&out.front(), &decompressedLength, &compressedData.front(), compressedData.size() )
!= Z_OK
|| decompressedLength != out.size() )
if ( ZSTD_isError( size_or_err ) || size_or_err != out.size() ) {
throw exFailedToDecompressNode();
}
}

char const * BtreeIndex::findChainOffsetExactOrPrefix(
Expand Down Expand Up @@ -758,6 +757,10 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
size_t maxElements,
uint32_t & lastLeafLinkOffset )
{

std::unique_ptr< ZSTD_CCtx, ZSTD::deleter > zstd_cctx;
zstd_cctx.reset( ZSTD_createCCtx() );

// We compress all the node data. This buffer would hold it.
vector< unsigned char > uncompressedData;

Expand Down Expand Up @@ -846,21 +849,24 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
}

// Save the result.
vector< unsigned char > compressedData( compressBound( uncompressedData.size() ) );
vector< unsigned char > compressedData( ZSTD_compressBound( uncompressedData.size() ) );

unsigned long compressedSize = compressedData.size();
const size_t size_or_err = ZSTD_compress2( zstd_cctx.get(),
compressedData.data(),
compressedData.size(),
uncompressedData.data(),
uncompressedData.size() );

if ( compress( &compressedData.front(), &compressedSize, &uncompressedData.front(), uncompressedData.size() )
!= Z_OK ) {
if ( ZSTD_isError( size_or_err ) ) {
qFatal( "Failed to compress btree node." );
abort();
}

uint32_t offset = file.tell();

file.write< uint32_t >( uncompressedData.size() );
file.write< uint32_t >( compressedSize );
file.write( &compressedData.front(), compressedSize );
file.write< uint32_t >( size_or_err );
file.write( &compressedData.front(), size_or_err );

if ( isLeaf ) {
// A link to the next leaf, which is zero and which will be updated
Expand Down
11 changes: 8 additions & 3 deletions src/btreeidx.hh
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
#include <stdint.h>
#include <string>
#include <vector>
#include <memory>

#include <QFuture>
#include <QList>
#include <QSet>
#include <QVector>

#include "zstd_wrapper.hh"


/// A base for the dictionary which creates a btree index to look up
/// the words.
Expand All @@ -28,11 +29,12 @@ using gd::wstring;
using std::vector;
using std::map;


enum {
/// This is to be bumped up each time the internal format changes.
/// The value isn't used here by itself, it is supposed to be added
/// to each dictionary's internal format version.
FormatVersion = 4
FormatVersion = 5
};

// These exceptions which might be thrown during the index traversal
Expand Down Expand Up @@ -139,6 +141,9 @@ protected:

protected:

std::unique_ptr< ZSTD_DCtx, ZSTD::deleter > zstd_dctx;

// The lifetime of the two members below is not managed by this class.
QMutex * idxFileMutex;
File::Class * idxFile;

Expand Down
29 changes: 18 additions & 11 deletions src/chunkedstorage.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */

#include "chunkedstorage.hh"
#include <zlib.h>
#include <zstd.h>
#include <string.h>

#include <QDataStream>
#include <QScopeGuard>
#include <QMutexLocker>
#include <QScopeGuard>

namespace ChunkedStorage {

Expand All @@ -19,6 +20,8 @@ Writer::Writer( File::Class & f ):
chunkStarted( false ),
bufferUsed( 0 )
{
zstd_cctx.reset( ZSTD_createCCtx() );

// Create a scratchpad at the beginning of the file. We use it to write chunk
// table if it would fit, in order to save some seek times.

Expand Down Expand Up @@ -64,21 +67,22 @@ void Writer::addToBlock( void const * data, size_t size )

void Writer::saveCurrentChunk()
{
size_t maxCompressedSize = compressBound( bufferUsed );

if ( bufferCompressed.size() < maxCompressedSize )
if ( size_t maxCompressedSize = ZSTD_compressBound( bufferUsed ); bufferCompressed.size() < maxCompressedSize )
bufferCompressed.resize( maxCompressedSize );

unsigned long compressedSize = bufferCompressed.size();
const size_t size_or_err =
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

API design is different.

In zlib,

compress will write the size written to its 2nd parameter

In Zstd,

compress will return the size written or an error code. facebook/zstd#1825 (comment)

ZSTD_compress2( zstd_cctx.get(), bufferCompressed.data(), bufferCompressed.size(), buffer.data(), bufferUsed );

if ( compress( &bufferCompressed.front(), &compressedSize, &buffer.front(), bufferUsed ) != Z_OK )
if ( ZSTD_isError( size_or_err ) ) {
throw exFailedToCompressChunk();
}

offsets.push_back( file.tell() );

file.write( (uint32_t)bufferUsed );
file.write( (uint32_t)compressedSize );
file.write( &bufferCompressed.front(), compressedSize );
file.write( (uint32_t)size_or_err );
file.write( bufferCompressed.data(), size_or_err );

bufferUsed = 0;

Expand Down Expand Up @@ -118,6 +122,8 @@ uint32_t Writer::finish()
Reader::Reader( File::Class & f, uint32_t offset ):
file( f )
{
zstd_dctx.reset( ZSTD_createDCtx() );

file.seek( offset );

uint32_t size = file.read< uint32_t >();
Expand Down Expand Up @@ -163,10 +169,11 @@ char * Reader::getBlock( uint32_t address, vector< char > & chunk )
} );
Q_UNUSED( autoUnmap )

unsigned long decompressedLength = chunk.size();

if ( uncompress( (unsigned char *)&chunk.front(), &decompressedLength, chunkDataBytes, compressedSize ) != Z_OK
|| decompressedLength != chunk.size() ) {
size_t const size_or_err =
ZSTD_decompressDCtx( zstd_dctx.get(), chunk.data(), chunk.size(), chunkDataBytes, compressedSize );

if ( ZSTD_isError( size_or_err ) || size_or_err != chunk.size() ) {
throw exFailedToDecompressChunk();
}
}
Expand Down
9 changes: 8 additions & 1 deletion src/chunkedstorage.hh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
#include "file.hh"

#include <vector>
#include <stdint.h>
#include <memory>

#include "zstd_wrapper.hh"

/// A chunked compression storage. We use this for articles' bodies. The idea
/// is to store data in a separately-compressed chunks, much like in dictzip,
Expand Down Expand Up @@ -66,6 +68,8 @@ private:
size_t bufferUsed;

void saveCurrentChunk();

std::unique_ptr< ZSTD_CCtx, ZSTD::deleter > zstd_cctx;
};

/// This class reads data blocks previously written by Writer.
Expand All @@ -83,6 +87,9 @@ public:
/// Uses the user-provided storage to load the entire chunk, and then to
/// return a pointer to the requested block inside it.
char * getBlock( uint32_t address, vector< char > & );

private:
std::unique_ptr< ZSTD_DCtx, ZSTD::deleter > zstd_dctx;
};

} // namespace ChunkedStorage
Expand Down
18 changes: 15 additions & 3 deletions src/dict/mdx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1270,11 +1270,13 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f

string dictId = Dictionary::makeDictionaryId( dictFiles );
string indexFile = indicesDir + dictId;

if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || indexIsOldOrBad( dictFiles, indexFile ) ) {
int t = 20;
mdxbench:
if ( true ) {
// Building the index

gdDebug( "MDict: Building the index for dictionary: %s\n", fileName.c_str() );
// gdDebug( "MDict: Building the index for dictionary: %s\n", fileName.c_str() );
auto index_begin = std::chrono::high_resolution_clock::now();

MdictParser parser;
list< sptr< MdictParser > > mddParsers;
Expand Down Expand Up @@ -1432,6 +1434,16 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f

idx.rewind();
idx.write( &idxHeader, sizeof( idxHeader ) );

auto index_finish = std::chrono::high_resolution_clock::now();
std::chrono::duration< double, std::milli > c = index_finish - index_begin;
qDebug() << c.count() << "ms";
//or qDebug() << c; for qt6.6

t = t - 1;
if ( t > 0 ) {
goto mdxbench;
}
}

dictionaries.push_back( std::make_shared< MdxDictionary >( dictId, indexFile, dictFiles ) );
Expand Down
20 changes: 20 additions & 0 deletions src/zstd_wrapper.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#pragma once

#include <zstd.h>

namespace ZSTD {

/// Stateless deleter that lets std::unique_ptr own a Zstd context.
///
/// It provides one call operator per context kind, so the same type can be
/// named as the deleter of both
/// std::unique_ptr< ZSTD_CCtx, ZSTD::deleter > and
/// std::unique_ptr< ZSTD_DCtx, ZSTD::deleter >.
struct deleter
{
  /// Releases a compression context by handing it to ZSTD_freeCCtx().
  void operator()( ZSTD_CCtx * compressionContext ) const
  {
    ZSTD_freeCCtx( compressionContext );
  }

  /// Releases a decompression context by handing it to ZSTD_freeDCtx().
  void operator()( ZSTD_DCtx * decompressionContext ) const
  {
    ZSTD_freeDCtx( decompressionContext );
  }
};

} // namespace ZSTD
Loading