From 7a5a5d8514ce388fba208c3758220dc2c555a236 Mon Sep 17 00:00:00 2001
From: shenleban tongying <shenlebantongying@gmail.com>
Date: Sun, 24 Mar 2024 16:01:34 -0400
Subject: [PATCH] feat: use Zstd for index file compression instead of zlib

---
 CMake_Unix.cmake      |  1 +
 src/btreeidx.cc       | 35 +++++++++++++++++++++++------------
 src/btreeidx.hh       | 24 +++++++++++++++++++++---
 src/chunkedstorage.cc | 32 ++++++++++++++++++++++----------
 src/chunkedstorage.hh | 22 +++++++++++++++++++++-
 src/dict/mdx.cc       |  3 ++-
 6 files changed, 90 insertions(+), 27 deletions(-)
diff --git a/CMake_Unix.cmake b/CMake_Unix.cmake
index db066f1980..bce28da987 100644
--- a/CMake_Unix.cmake
+++ b/CMake_Unix.cmake
@@ -40,6 +40,7 @@ pkg_check_modules(PKGCONFIG_DEPS IMPORTED_TARGET
         vorbis # .ogg
         vorbisfile
         liblzma
+        libzstd
         xapian-core
 )
 
diff --git a/src/btreeidx.cc b/src/btreeidx.cc
index 22132dc3d5..5e7e27f2a6 100644
--- a/src/btreeidx.cc
+++ b/src/btreeidx.cc
@@ -4,8 +4,7 @@
 #include "btreeidx.hh"
 #include "folding.hh"
 #include "utf8.hh"
-#include <QRunnable>
-#include <QThreadPool>
+
 #include <QSemaphore>
 #include <math.h>
 #include <string.h>
@@ -14,7 +13,6 @@
 #include "wstring_qt.hh"
 #include "utils.hh"
 
-#include <QRegularExpression>
 #include "wildcard.hh"
 #include "globalbroadcaster.hh"
 
@@ -36,6 +34,7 @@ BtreeIndex::BtreeIndex():
   idxFile( nullptr ),
   rootNodeLoaded( false )
 {
+  zstd_dctx.reset( ZSTD_createDCtx() );
 }
 
 BtreeDictionary::BtreeDictionary( string const & id, vector< string > const & dictionaryFiles ):
@@ -411,10 +410,15 @@ void BtreeIndex::readNode( uint32_t offset, vector< char > & out )
 
   unsigned long decompressedLength = out.size();
 
-  if ( uncompress( (unsigned char *)&out.front(), &decompressedLength, &compressedData.front(), compressedData.size() )
-         != Z_OK
-       || decompressedLength != out.size() )
+  const size_t size_or_err = ZSTD_decompressDCtx( zstd_dctx.get(),
+                                                  out.data(),
+                                                  decompressedLength,
+                                                  compressedData.data(),
+                                                  compressedData.size() );
+
+  if ( ZSTD_isError( size_or_err ) || size_or_err != out.size() ) {
     throw exFailedToDecompressNode();
+  }
 }
 
 char const * BtreeIndex::findChainOffsetExactOrPrefix(
@@ -758,6 +762,10 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
                                 size_t maxElements,
                                 uint32_t & lastLeafLinkOffset )
 {
+
+  std::unique_ptr< ZSTD_CCtx, zstd_deleter > zstd_cctx;
+  zstd_cctx.reset( ZSTD_createCCtx() );
+
   // We compress all the node data. This buffer would hold it.
   vector< unsigned char > uncompressedData;
 
@@ -846,12 +854,15 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
   }
 
   // Save the result.
-  vector< unsigned char > compressedData( compressBound( uncompressedData.size() ) );
+  vector< unsigned char > compressedData( ZSTD_compressBound( uncompressedData.size() ) );
 
-  unsigned long compressedSize = compressedData.size();
+  const size_t size_or_err = ZSTD_compress2( zstd_cctx.get(),
+                                             compressedData.data(),
+                                             compressedData.size(),
+                                             uncompressedData.data(),
+                                             uncompressedData.size() );
 
-  if ( compress( &compressedData.front(), &compressedSize, &uncompressedData.front(), uncompressedData.size() )
-       != Z_OK ) {
+  if ( ZSTD_isError( size_or_err ) ) {
     qFatal( "Failed to compress btree node." );
     abort();
   }
@@ -859,8 +870,8 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
   uint32_t offset = file.tell();
 
   file.write< uint32_t >( uncompressedData.size() );
-  file.write< uint32_t >( compressedSize );
-  file.write( &compressedData.front(), compressedSize );
+  file.write< uint32_t >( size_or_err );
+  file.write( &compressedData.front(), size_or_err );
 
   if ( isLeaf ) {
     // A link to the next leef, which is zero and which will be updated
diff --git a/src/btreeidx.hh b/src/btreeidx.hh
index 77c905460f..15c5e47d3d 100644
--- a/src/btreeidx.hh
+++ b/src/btreeidx.hh
@@ -12,12 +12,13 @@
 #include <stdint.h>
 #include <string>
 #include <vector>
+#include <memory>
 
 #include <QFuture>
-#include <QList>
-#include <QSet>
 #include <QVector>
 
+#include <zstd.h>
+
 
 /// A base for the dictionary which creates a btree index to look up
 /// the words.
@@ -28,11 +29,25 @@ using gd::wstring;
 using std::vector;
 using std::map;
 
+struct zstd_deleter // Deleter functor for std::unique_ptr holding a zstd context.
+{
+  void operator()( ZSTD_DCtx * Ctx ) const // Releases a decompression context.
+  {
+    ZSTD_freeDCtx( Ctx );
+  }
+
+  void operator()( ZSTD_CCtx * Ctx ) const // Releases a compression context.
+  {
+    ZSTD_freeCCtx( Ctx );
+  }
+};
+
+
 enum {
   /// This is to be bumped up each time the internal format changes.
   /// The value isn't used here by itself, it is supposed to be added
   /// to each dictionary's internal format version.
-  FormatVersion = 4
+  FormatVersion = 5
 };
 
 // These exceptions which might be thrown during the index traversal
@@ -139,6 +154,9 @@ protected:
 
 protected:
 
+  std::unique_ptr< ZSTD_DCtx, zstd_deleter > zstd_dctx;
+
+  // The lifetime of the two members below is not managed by this class.
   QMutex * idxFileMutex;
   File::Class * idxFile;
 
diff --git a/src/chunkedstorage.cc b/src/chunkedstorage.cc
index 9a33e7a4a7..1087a90677 100644
--- a/src/chunkedstorage.cc
+++ b/src/chunkedstorage.cc
@@ -2,10 +2,8 @@
  * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
 
 #include "chunkedstorage.hh"
-#include <zlib.h>
+#include <zstd.h>
 #include <string.h>
-#include <QDataStream>
-#include <QScopeGuard>
 #include <QMutexLocker>
 
 namespace ChunkedStorage {
@@ -19,6 +17,8 @@ Writer::Writer( File::Class & f ):
   chunkStarted( false ),
   bufferUsed( 0 )
 {
+  zstd_cctx.reset( ZSTD_createCCtx() );
+
   // Create a sratchpad at the beginning of file. We use it to write chunk
   // table if it would fit, in order to save some seek times.
 
@@ -64,21 +64,26 @@ void Writer::addToBlock( void const * data, size_t size )
 
 void Writer::saveCurrentChunk()
 {
-  size_t maxCompressedSize = compressBound( bufferUsed );
+  size_t maxCompressedSize = ZSTD_compressBound( bufferUsed );
 
   if ( bufferCompressed.size() < maxCompressedSize )
     bufferCompressed.resize( maxCompressedSize );
 
-  unsigned long compressedSize = bufferCompressed.size();
 
-  if ( compress( &bufferCompressed.front(), &compressedSize, &buffer.front(), bufferUsed ) != Z_OK )
+  const size_t size_or_err = ZSTD_compress2( zstd_cctx.get(),
+                                             bufferCompressed.data(),
+                                             bufferCompressed.size(),
+                                             buffer.data(),
+                                             bufferUsed );
+  if ( ZSTD_isError( size_or_err ) ) {
     throw exFailedToCompressChunk();
+  }
 
   offsets.push_back( file.tell() );
 
   file.write( (uint32_t)bufferUsed );
-  file.write( (uint32_t)compressedSize );
-  file.write( &bufferCompressed.front(), compressedSize );
+  file.write( (uint32_t)size_or_err );
+  file.write( &bufferCompressed.front(), size_or_err );
 
   bufferUsed = 0;
 
@@ -118,6 +123,8 @@ uint32_t Writer::finish()
 Reader::Reader( File::Class & f, uint32_t offset ):
   file( f )
 {
+  zstd_dctx.reset( ZSTD_createDCtx() );
+
   file.seek( offset );
 
   uint32_t size = file.read< uint32_t >();
@@ -165,8 +172,13 @@ char * Reader::getBlock( uint32_t address, vector< char > & chunk )
 
     unsigned long decompressedLength = chunk.size();
 
-    if ( uncompress( (unsigned char *)&chunk.front(), &decompressedLength, chunkDataBytes, compressedSize ) != Z_OK
-         || decompressedLength != chunk.size() ) {
+    size_t const size_or_err = ZSTD_decompressDCtx( zstd_dctx.get(),
+                                                    chunk.data(),
+                                                    decompressedLength,
+                                                    chunkDataBytes,
+                                                    compressedSize );
+
+    if ( ZSTD_isError( size_or_err ) || size_or_err != chunk.size() ) {
       throw exFailedToDecompressChunk();
     }
   }
diff --git a/src/chunkedstorage.hh b/src/chunkedstorage.hh
index 76e958ed5b..baae7b04d3 100644
--- a/src/chunkedstorage.hh
+++ b/src/chunkedstorage.hh
@@ -8,7 +8,9 @@
 #include "file.hh"
 
 #include <vector>
-#include <stdint.h>
+#include <memory>
+
+#include <zstd.h>
 
 /// A chunked compression storage. We use this for articles' bodies. The idea
 /// is to store data in a separately-compressed chunks, much like in dictzip,
@@ -27,6 +29,19 @@ DEF_EX( exAddressOutOfRange, "The given chunked address is out of range", Ex )
 DEF_EX( exFailedToDecompressChunk, "Failed to decompress a chunk", Ex )
 DEF_EX( mapFailed, "Failed to map/unmap the file", Ex )
 
+struct zstd_deleter // Deleter functor for std::unique_ptr holding a zstd context.
+{
+  void operator()( ZSTD_DCtx * Ctx ) const // Releases a decompression context.
+  {
+    ZSTD_freeDCtx( Ctx );
+  }
+
+  void operator()( ZSTD_CCtx * Ctx ) const // Releases a compression context.
+  {
+    ZSTD_freeCCtx( Ctx );
+  }
+};
+
 /// This class writes data blocks in chunks.
 class Writer
 {
@@ -66,6 +81,8 @@ private:
   size_t bufferUsed;
 
   void saveCurrentChunk();
+
+  std::unique_ptr<ZSTD_CCtx,zstd_deleter> zstd_cctx;
 };
 
 /// This class reads data blocks previously written by Writer.
@@ -83,6 +100,9 @@ public:
   /// Uses the user-provided storage to load the entire chunk, and then to
   /// return a pointer to the requested block inside it.
   char * getBlock( uint32_t address, vector< char > & );
+
+private:
+  std::unique_ptr<ZSTD_DCtx,zstd_deleter> zstd_dctx;
 };
 
 } // namespace ChunkedStorage
diff --git a/src/dict/mdx.cc b/src/dict/mdx.cc
index a4834cf166..bdc9ea74f2 100644
--- a/src/dict/mdx.cc
+++ b/src/dict/mdx.cc
@@ -1437,7 +1437,8 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
 
       auto index_finish = std::chrono::high_resolution_clock::now();
       std::chrono::duration<double, std::milli> c = index_finish - index_begin;
-      qDebug() << c;
+      qDebug() << c.count()<<"ms";
+      // Or, for Qt 6.6: qDebug() << c;
 
       t = t - 1;
       if ( t > 0 ) {