From 266f3414053ae7e42b3e664367cc5befefd8cc97 Mon Sep 17 00:00:00 2001 From: mourisl Date: Sat, 18 Nov 2023 19:31:14 -0500 Subject: [PATCH] Add back compactds as source code instead of submodule --- compactds/Alphabet.hpp | 196 ++ compactds/Bitvector.hpp | 70 + compactds/Bitvector_Compressed.hpp | 370 ++++ compactds/Bitvector_Plain.hpp | 206 ++ compactds/Bitvector_RunLength.hpp | 228 ++ compactds/Bitvector_Sparse.hpp | 336 +++ compactds/CompactMapper.hpp | 130 ++ compactds/CompressedSuffixArray.hpp | 154 ++ compactds/DS_InvPermutation.hpp | 137 ++ compactds/DS_Parenthesis.hpp | 150 ++ compactds/DS_PatternRankSelect.hpp | 272 +++ compactds/DS_RangeMinMaxTree.hpp | 920 ++++++++ compactds/DS_Rank.hpp | 298 +++ compactds/DS_Select.hpp | 753 +++++++ compactds/DS_Select_Test.hpp | 537 +++++ compactds/DifferenceCover.hpp | 201 ++ compactds/EliasCode.hpp | 74 + compactds/FMBuilder.hpp | 504 +++++ compactds/FMIndex.hpp | 491 +++++ compactds/FixedSizeElemArray.hpp | 322 +++ compactds/FractionBitElemArray.hpp | 118 ++ compactds/HuffmanCode.hpp | 230 ++ compactds/InterleavedFixedSizeElemArray.hpp | 238 +++ compactds/InvertedIndex.hpp | 131 ++ compactds/Makefile | 31 + compactds/PartialSum.hpp | 140 ++ compactds/PerfectHash.hpp | 199 ++ compactds/Permutation.hpp | 237 +++ compactds/Sequence.hpp | 48 + compactds/SequenceCompactor.hpp | 76 + compactds/Sequence_Hybrid.hpp | 328 +++ compactds/Sequence_Permutation.hpp | 70 + compactds/Sequence_Plain.hpp | 101 + compactds/Sequence_RunBlock.hpp | 363 ++++ compactds/Sequence_RunLength.hpp | 191 ++ compactds/Sequence_WaveletTree.hpp | 338 +++ compactds/SimpleVector.hpp | 388 ++++ compactds/SuffixArrayGenerator.hpp | 725 +++++++ compactds/Tree.hpp | 167 ++ compactds/Tree_BP.hpp | 316 +++ compactds/Tree_Cardinal.hpp | 43 + compactds/Tree_Cardinal_LOUDS.hpp | 203 ++ compactds/Tree_Cardinal_Ordinal.hpp | 167 ++ compactds/Tree_Cardinal_Plain.hpp | 255 +++ compactds/Tree_DFUDS.hpp | 283 +++ compactds/Tree_LOUDS.hpp | 171 ++ 
compactds/Tree_Labeled.hpp | 256 +++ compactds/Tree_Plain.hpp | 277 +++ compactds/UniversalHashGenerator.hpp | 81 + compactds/Utils.hpp | 292 +++ compactds/VariableSizeElemArray.hpp | 33 + .../VariableSizeElemArray_DensePointers.hpp | 144 ++ .../VariableSizeElemArray_DirectAccess.hpp | 76 + .../VariableSizeElemArray_SampledPointers.hpp | 118 ++ compactds/bitvector_benchmark.cpp | 162 ++ compactds/notes.md | 7 + compactds/rbbwt.cpp | 148 ++ compactds/test.cpp | 1881 +++++++++++++++++ 58 files changed, 15381 insertions(+) create mode 100644 compactds/Alphabet.hpp create mode 100644 compactds/Bitvector.hpp create mode 100644 compactds/Bitvector_Compressed.hpp create mode 100644 compactds/Bitvector_Plain.hpp create mode 100644 compactds/Bitvector_RunLength.hpp create mode 100644 compactds/Bitvector_Sparse.hpp create mode 100644 compactds/CompactMapper.hpp create mode 100644 compactds/CompressedSuffixArray.hpp create mode 100644 compactds/DS_InvPermutation.hpp create mode 100644 compactds/DS_Parenthesis.hpp create mode 100644 compactds/DS_PatternRankSelect.hpp create mode 100644 compactds/DS_RangeMinMaxTree.hpp create mode 100644 compactds/DS_Rank.hpp create mode 100644 compactds/DS_Select.hpp create mode 100644 compactds/DS_Select_Test.hpp create mode 100644 compactds/DifferenceCover.hpp create mode 100644 compactds/EliasCode.hpp create mode 100644 compactds/FMBuilder.hpp create mode 100644 compactds/FMIndex.hpp create mode 100644 compactds/FixedSizeElemArray.hpp create mode 100644 compactds/FractionBitElemArray.hpp create mode 100644 compactds/HuffmanCode.hpp create mode 100644 compactds/InterleavedFixedSizeElemArray.hpp create mode 100644 compactds/InvertedIndex.hpp create mode 100644 compactds/Makefile create mode 100644 compactds/PartialSum.hpp create mode 100644 compactds/PerfectHash.hpp create mode 100644 compactds/Permutation.hpp create mode 100644 compactds/Sequence.hpp create mode 100644 compactds/SequenceCompactor.hpp create mode 100644 
compactds/Sequence_Hybrid.hpp create mode 100644 compactds/Sequence_Permutation.hpp create mode 100644 compactds/Sequence_Plain.hpp create mode 100644 compactds/Sequence_RunBlock.hpp create mode 100644 compactds/Sequence_RunLength.hpp create mode 100644 compactds/Sequence_WaveletTree.hpp create mode 100644 compactds/SimpleVector.hpp create mode 100644 compactds/SuffixArrayGenerator.hpp create mode 100644 compactds/Tree.hpp create mode 100644 compactds/Tree_BP.hpp create mode 100644 compactds/Tree_Cardinal.hpp create mode 100644 compactds/Tree_Cardinal_LOUDS.hpp create mode 100644 compactds/Tree_Cardinal_Ordinal.hpp create mode 100644 compactds/Tree_Cardinal_Plain.hpp create mode 100644 compactds/Tree_DFUDS.hpp create mode 100644 compactds/Tree_LOUDS.hpp create mode 100644 compactds/Tree_Labeled.hpp create mode 100644 compactds/Tree_Plain.hpp create mode 100644 compactds/UniversalHashGenerator.hpp create mode 100644 compactds/Utils.hpp create mode 100644 compactds/VariableSizeElemArray.hpp create mode 100644 compactds/VariableSizeElemArray_DensePointers.hpp create mode 100644 compactds/VariableSizeElemArray_DirectAccess.hpp create mode 100644 compactds/VariableSizeElemArray_SampledPointers.hpp create mode 100644 compactds/bitvector_benchmark.cpp create mode 100644 compactds/notes.md create mode 100644 compactds/rbbwt.cpp create mode 100644 compactds/test.cpp diff --git a/compactds/Alphabet.hpp b/compactds/Alphabet.hpp new file mode 100644 index 0000000..9c16346 --- /dev/null +++ b/compactds/Alphabet.hpp @@ -0,0 +1,196 @@ +#ifndef _MOURISL_COMPACTDS_DS_ALPHABET +#define _MOURISL_COMPACTDS_DS_ALPHABET + +#include "Utils.hpp" +#include "HuffmanCode.hpp" +#include "FixedSizeElemArray.hpp" + +typedef char ALPHABET ; + +#define ALPHABET_CODE_NOCODE 0 +#define ALPHABET_CODE_PLAIN 1 +#define ALPHABET_CODE_HUFFMAN 2 + +// The data structe for mapping alphabet +// Conceptually, all the other data structure regard the alphabet as {0,...,|sigma|-1}, +// This function serves to 
map these numeric alphabet to actually alphabet(char by default). +namespace compactds { +class Alphabet +{ +private: + size_t _space ; + int _method ; + ALPHABET *_alphabetList ; + int _alphabetCode[1<<(sizeof(ALPHABET) * 8)] ; + short _alphabetCodeLen[1<<(sizeof(ALPHABET) * 8)] ; // the length of encoded bits. + size_t _n ; + + HuffmanCode huffmanCode ; +public: + Alphabet() + { + _n = _space = 0 ; + _method = ALPHABET_CODE_NOCODE ; + } + + ~Alphabet() { Free() ; } + + void Free() + { + if (_n != 0) + free(_alphabetList) ; + } + + size_t GetSpace() { return _space + sizeof(*this); } + + // Use plain binary number sequentially for the characters in s. + // @return: code length + int InitFromList(const ALPHABET *s, size_t n) + { + size_t i ; + this->_n = n ; + _alphabetList = (ALPHABET *)malloc(sizeof(ALPHABET) * n) ; + _space = sizeof(ALPHABET) * n ; + memset(_alphabetCode, 0, sizeof(_alphabetCode)) ; + memset(_alphabetCodeLen, 0, sizeof(_alphabetCodeLen)) ; + + int codeLen = Utils::Log2Ceil(n) ; + for (i = 0 ; i < n ; ++i) + { + _alphabetList[i] = s[i] ; + _alphabetCode[ (int)s[i] ]= i ; + _alphabetCodeLen[ (int)s[i] ] = codeLen ; + } + _method = ALPHABET_CODE_PLAIN ; + return codeLen ; + } + + // s: list of the characters + // freq: list of the frequencies for each character + // n: number of character + void InitHuffman(const ALPHABET *s, const uint64_t *freq, size_t n) + { + size_t i ; + this->_n = n ; + _alphabetList = (ALPHABET *)malloc(sizeof(ALPHABET) * n) ; + for (i = 0 ; i < n ; ++i) + _alphabetList[i] = s[i] ; + + huffmanCode.InitFromFrequency(freq, n) ; + + for (i = 0 ; i < n ; ++i) + { + int l ; + _alphabetCode[i] = huffmanCode.Encode(i, l) ; + _alphabetCodeLen[i] = l ; + } + _method = ALPHABET_CODE_HUFFMAN ; + } + + size_t GetAlphabetCapacity() const + { + if (ALPHABET_CODE_NOCODE) + return 0 ; + else if (ALPHABET_CODE_PLAIN) + return 1<<(Utils::Log2Ceil(_n)) ; + else if (ALPHABET_CODE_HUFFMAN) + return _n ; + return 0 ; + } + + size_t GetSize() 
const + { + return _n ; + } + + // l: how many bits used in the coding + ALPHABET Decode(WORD c, int l) const + { + //l = _alphabetCodeLen[ (int)_alphabetList[i] ] ; + size_t i ; + if (_method == ALPHABET_CODE_NOCODE) + { + return c ; + } + + if (_method == ALPHABET_CODE_PLAIN) + i = c ; + else + i = huffmanCode.Decode(c, l) ; + return _alphabetList[i] ; + } + + WORD Encode(ALPHABET c, int &l) const + { + if (_method == ALPHABET_CODE_NOCODE) + { + //l = Utils::CountBits(c) ; + l = 0 ; + return c ; + } + else + { + l = _alphabetCodeLen[(int)c] ; + return _alphabetCode[(int)c] ; + } + } + + WORD Encode(ALPHABET c) const + { + if (_method == ALPHABET_CODE_NOCODE) + return c ; + else + return _alphabetCode[(int)c] ; + } + + // test whether the alphabet c is in the list + bool IsIn(ALPHABET c) const + { + size_t i ; + for (i = 0 ; i < _n ; ++i) + if (_alphabetList[i] == c) + return true ; + return false ; + } + + Alphabet& operator=(const Alphabet &in) + { + Free() ; + _n = in._n ; + _space = in._space ; + _method = in._method ; + + _alphabetList = (ALPHABET *)malloc(sizeof(ALPHABET) * _n) ; + _space = sizeof(ALPHABET) * _n ; + memcpy(_alphabetList, in._alphabetList, sizeof(ALPHABET) * _n ) ; + memcpy(_alphabetCode, in._alphabetCode, sizeof(_alphabetCode)) ; + memcpy(_alphabetCodeLen, in._alphabetCodeLen, sizeof(_alphabetCodeLen)) ; + huffmanCode = in.huffmanCode ; + return *this ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _space) ; + SAVE_VAR(fp, _method) ; + SAVE_VAR(fp, _n) ; + fwrite(_alphabetList, sizeof(ALPHABET), _n, fp) ; + fwrite(_alphabetCode, sizeof(_alphabetCode[0]), 1<<(sizeof(ALPHABET) * 8), fp) ; + fwrite(_alphabetCodeLen, sizeof(_alphabetCodeLen[0]), 1<<(sizeof(ALPHABET) * 8), fp) ; + } + + void Load(FILE *fp) + { + Free() ; + LOAD_VAR(fp, _space) ; + LOAD_VAR(fp, _method) ; + LOAD_VAR(fp, _n) ; + + _alphabetList = (ALPHABET *)malloc(sizeof(ALPHABET) * _n) ; + fread(_alphabetList, sizeof(ALPHABET), _n, fp) ; + fread(_alphabetCode, 
sizeof(_alphabetCode[0]), 1<<(sizeof(ALPHABET) * 8), fp) ; + fread(_alphabetCodeLen, sizeof(_alphabetCodeLen[0]), 1<<(sizeof(ALPHABET) * 8), fp) ; + } +} ; +} +#endif diff --git a/compactds/Bitvector.hpp b/compactds/Bitvector.hpp new file mode 100644 index 0000000..9757a32 --- /dev/null +++ b/compactds/Bitvector.hpp @@ -0,0 +1,70 @@ +#ifndef _MOURISL_COMPACTDS_BITVECTOR +#define _MOURISL_COMPACTDS_BITVECTOR + +#include "Utils.hpp" + +#define BITVECTOR_DEFAULT_SELECT_SPEED 3 + +// The overall functionality of bitvector +namespace compactds { +class Bitvector +{ +protected: + size_t _space ; +public: + Bitvector() {_space = 0 ;} + ~Bitvector() {} + + // W is the plain bit vector + virtual void Init(const WORD *W, const size_t n) = 0 ; + virtual void Free() = 0 ; + virtual size_t GetSpace() = 0; + + // Return the ith bits (0-based) + virtual int Access(size_t i) const = 0 ; + // Return the number of 1s before i + virtual size_t Rank1(size_t i, int inclusive = 1) const = 0 ; + // Return the index of th i-th (i is 1-based, so rank and select are inversible) 1 + // it is for 1 only for now + virtual size_t Select(size_t i) const = 0 ; + + // Return the rightmost 1 in [0..i] + // TODO: Handle the boundary cases + size_t Pred(size_t i) const + { + return Select( Rank1(i) ) ; + } + + // Return the leftmost 1 in [i..n-1] + size_t Succ(size_t i) const + { + return Select( Rank1(i, /*inclusive=*/0) + 1 ) ; + } + + // Return the number of 0s before i + size_t Rank0(size_t i, int inclusive = 1) const + { + // There are i+1 elements in [0..i], and Rank(i) of them are 1's + return i + inclusive - Rank1(i, inclusive) ; + } + + size_t Rank(int type, size_t i, int inclusive = 1) const + { + if (type == 1) + return Rank1(i, inclusive) ; + else + return Rank0(i, inclusive) ; + } + + virtual void Save(FILE *fp) + { + SAVE_VAR(fp, _space) ; + } + + virtual void Load(FILE *fp) + { + LOAD_VAR(fp, _space) ; + } +} ; +} +#endif diff --git a/compactds/Bitvector_Compressed.hpp 
b/compactds/Bitvector_Compressed.hpp new file mode 100644 index 0000000..fe0d011 --- /dev/null +++ b/compactds/Bitvector_Compressed.hpp @@ -0,0 +1,370 @@ +#ifndef _MOURISL_COMPACTDS_BITVECTOR_COMPRESSED +#define _MOURISL_COMPACTDS_BITVECTOR_COMPRESSED + +#include "Utils.hpp" +#include "Bitvector.hpp" + +#include "FixedSizeElemArray.hpp" + +// The compressed bitvector based on chaptor 4 +// This seems to be the RRR bitvector +namespace compactds { +class Bitvector_Compressed : public Bitvector +{ +private: + int _b ; // block size for bit vector + int _pb ; // block size for partial sum array _P + size_t _n ; // the total raw length of the bits + + // Variables for compress the bit vector + FixedSizeElemArray _C ; // the array for the count of bits + WORD *_O ; // encoded offsets within each block + size_t *_P ; // partial sum on offset array _O + + uint64_t **_choose ; // _C(i, j) + int *_L ; // the required bits for _O for each _Ci + + // Variables for ranking query + uint64_t *_R ; // precomputed rank (right-exclusive). 
R and _P are aligned + + // Variables for selection + size_t *_S ; + int _sb ; // the block size for selection + size_t _sBlockCnt ; + + int _selectSpeed ; + + void EncodeBits(const WORD &B, int &c, size_t &o) const + { + int i ; + WORD maskedB = B & MASK(_b) ; + int onecnt = Utils::Popcount(maskedB) ; + o = 0 ; + c = 0 ; + for (i = _b - 1 ; i >= 0 ; --i) + { + if ((maskedB >> i) & 1) + { + o += _choose[i][onecnt - c] ; + ++c ; + } + } + } + + WORD DecodeBits(int c, size_t o) const + { + WORD ret = 0 ; + int usedOnes = 0 ; + int i ; + for (i = _b - 1 ; i >= 0 ; --i) + { + ret <<= 1 ; + if (o >= _choose[i][c - usedOnes]) + { + ret |= 1 ; + o -= _choose[i][c - usedOnes]; + ++usedOnes ; + } + } + + return ret ; + } + + void InitChoose(int b) + { + int i, j ; + + // Build the _choose array + _choose = (uint64_t**)malloc(sizeof(*_choose) * (b+1) ) ; + _space += sizeof(*_choose) * (b + 1) ; + for (i = 0 ; i <= b ; ++i) + { + _choose[i] = (uint64_t*)malloc(sizeof(**_choose) * (i + 2)) ; + _space += sizeof(**_choose) * (i + 2) ; + } + for (i = 0 ; i <= b ; ++i) + { + _choose[i][0] = 1 ; + for (j = 1 ; j < i ; ++j) + { + _choose[i][j] = _choose[i - 1][j - 1] + _choose[i - 1][j] ; + } + _choose[i][i] = 1 ; + _choose[i][i + 1] = 0 ; + } + + _L = (int *)malloc(sizeof(*_L) * (b + 1)) ; + _space += sizeof(*_L) * (b + 1) ; + for (i = 0 ; i <= b ; ++i) + { + // There are _choose[b][i] different combinations for each _C_i + _L[i] = Utils::Log2Ceil(_choose[b][i]) ; + } + } + +public: + Bitvector_Compressed() + { + _n = _b = _pb = _sb = 0 ; + _selectSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + } + ~Bitvector_Compressed() {Free();} + + // blockSize should be 2^x - 1.so _C can be fully utilized + void SetBlockSize(int blockSize) + { + _b = blockSize ; + } + + void SetPsumBlockSize(int psumBlockSize) + { + _pb = psumBlockSize ; + } + + void SetSelectBlockSize(int selectBlockSize) + { + _sb = selectBlockSize ; + } + + void SetSelectSpeed(int in) + { + _selectSpeed = in ; + } + + // W is 
the plain bit vector + void Init(const WORD *W, const size_t n) + { + size_t i, j ; + this->_n = n ; + _space = 0 ; + + if (_b <= 0) + _b = 8 * sizeof(WORD) - 1 ; + + if (_pb <= 0) + _pb = 8 * sizeof(size_t) ; + + if (_sb <= _b) + _sb = 8 * sizeof(size_t) * 8 * sizeof(size_t) ; + size_t blockCnt = DIV_CEIL(n, _b) ; + InitChoose(_b) ; // Initialize _choose and _L + + _C.Malloc(Utils::Log2Ceil(_b + 1), blockCnt) ; + _space += _C.GetSpace() ; + + // _Calculate the size for _O + size_t offsetsSize = 0 ; + int maxOneCntInBlock = 0 ; + uint64_t totalOneCnt = 0 ; + for (i = 0 ; i < _n ; i += _b) + { + int onecnt = Utils::Popcount( Utils::BitsRead(W, i, (i + _b < _n ? i + _b - 1 : _n - 1)) ) ; + + offsetsSize += _L[onecnt] ; + totalOneCnt += onecnt ; + if (onecnt > maxOneCntInBlock) + maxOneCntInBlock = onecnt ; + } + + _O = Utils::MallocByBits(offsetsSize) ; + _space += Utils::BitsToWordBytes(offsetsSize) ; + + size_t psumBlockCnt = DIV_CEIL(blockCnt, _pb) ; + _P = (size_t *)malloc(sizeof(size_t) * psumBlockCnt) ; + _space += sizeof(size_t) * psumBlockCnt ; + + _R = (uint64_t *)malloc(sizeof(uint64_t) * psumBlockCnt) ; + _space += sizeof(uint64_t) * psumBlockCnt ; + + if (_selectSpeed > 0) + { + _sBlockCnt = DIV_CEIL(totalOneCnt, _sb) ; + _S = (size_t *)malloc(sizeof(size_t) * _sBlockCnt) ; + _space += sizeof(uint64_t) * _sBlockCnt ; + } + + // Build the _C, _O, _P that compress the bit vector + // Also build the data structures for rank and selections + // j is used to index _O + size_t blocki ; + uint64_t onecntSum = 0 ; + bool locateFirstOne = false ; + for (i = 0, j = 0, blocki = 0 ; i < _n ; i += _b, ++blocki) + { + WORD bits = Utils::BitsRead(W, i, (i + _b < _n ? i + _b - 1 : _n - 1)) ; + + int tmpc ; + size_t tmpo ; + EncodeBits(bits, tmpc, tmpo) ; + + //printf("%d %llu. 
%llu\n", tmpc, tmpo, bits) ; + if (blocki % _pb == 0) + { + _P[blocki/_pb] = j ; + } + _C.Write(blocki, tmpc) ; + if (_L[tmpc] > 0) + { + Utils::BitsWrite(_O, j, j + _L[tmpc] - 1, tmpo) ; + } + j += _L[tmpc] ; + + // _Process the information for rank operation + if (blocki % _pb == 0) + _R[blocki / _pb] = onecntSum ; + + // _Process the information for select operation + if (_selectSpeed && (onecntSum / _sb != (onecntSum + tmpc) / _sb + || (!locateFirstOne && tmpc > 0))) + { + int localOneCnt = 0 ; + int l = 0 ; + for (l = 0 ; l < _b ; ++l) + if ((bits >> l)&1) + { + if ((onecntSum + localOneCnt) % _sb == 0) + { + _S[(onecntSum + localOneCnt) / _sb] = i + l ; + break ; + } + ++localOneCnt ; + } + locateFirstOne = true ; + } + onecntSum += tmpc ; + } + } + + void Free() + { + _C.Free() ; + if (_n != 0) + { + int i ; + for (i = 0 ; i <= _b ; ++i) + free(_choose[i]) ; + free(_choose) ; + free(_L) ; + + free(_O) ; + free(_P) ; + + free(_R) ; + + free(_S) ; + _n = 0 ; + } + } + + // Return the ith bits (0-based) + int Access(size_t i) const + { + // Get the partial sum from _P + size_t bi = i / _b ; + size_t pi = bi / _pb ; + + size_t j ; + int blockc = _C.Read(bi) ; + if (blockc == 0) + return 0 ; + else if (blockc == _b) + return 1 ; + + size_t blocko ; + size_t os = _P[pi] ; // start position in o + // j to index the block offsets + for (j = pi * _pb ; j < bi ; ++j) + os += _L[ _C.Read(j) ] ; + blocko = Utils::BitsRead(_O, os, os + _L[blockc] - 1) ; + + WORD bits = DecodeBits(blockc, blocko) ; + + int residuali = i % _b ; + return (bits >> residuali) & 1 ; + } + + // Return the _number of 1s before i + size_t Rank1(size_t i, int inclusive = 1) const + { + size_t j ; + size_t bi = i / _b ; // index for block + size_t ri = bi / _pb ; // index for R + + size_t ret = _R[ri] ; + size_t os = _P[ri] ; + for (j = ri * _pb ; j < bi ; ++j) + { + int onecnt = _C.Read(j) ; + ret += onecnt ; + os += _L[onecnt] ; + } + int blockc = _C.Read(bi) ; + if (blockc == 0) + return ret ; + 
else if (blockc == _b) + return ret + i%_b + inclusive ; + else + { + size_t blocko = Utils::BitsRead(_O, os, os + _L[blockc] - 1) ; + + WORD bits = DecodeBits(blockc, blocko) ; + + int residuali = i % _b ; + return ret + Utils::Popcount( bits & MASK_WCHECK(residuali + inclusive) ) ; + } + } + + // Return the index of th i-th (1-based, so rank and select are inversible) 1 + size_t Select(size_t i) const + { + if (i == 0) + return POSITIVE_INF ; + // Unlike the uncompressed case, binary search might be less efficient + // because we _need to sequentially find the appropriate _O in rank. + size_t j ; + size_t si = (i-1) / _sb ; + size_t bi = _S[si] / _b ; // it aligns to block bi + size_t pi = bi / _pb ; // block bi belongs to the _P-block recording the offset in _O + + size_t os = _P[pi] ; + // We rollback the index a little bit to align with the information of _O + uint64_t onecntSum = _R[pi] ; // Another bless that R and _P are aligned + // j index the block + for (j = pi * _pb ; j * _b < _n ; ++j) + { + int blockc = _C.Read(j) ; + if (onecntSum + blockc >= i) + { + // the desired 1 is in this block + size_t blocko = Utils::BitsRead(_O, os, os + _L[blockc] - 1) ; + WORD bits = DecodeBits(blockc, blocko) ; + + int l ; + for (l = 0 ; l < _b ; ++l) + if ((bits >> l) & 1) + { + ++onecntSum ; + if (onecntSum == i) + return j * _b + l ; + } + break ; + } + os += _L[blockc] ; + onecntSum += blockc ; + } + return 0 ; + } + + size_t Select(int type, size_t i) const + { + return 0 ; + } + + size_t GetSpace() + { + return _space + sizeof(*this) ; + } +} ; +} + +#endif diff --git a/compactds/Bitvector_Plain.hpp b/compactds/Bitvector_Plain.hpp new file mode 100644 index 0000000..8e37f2d --- /dev/null +++ b/compactds/Bitvector_Plain.hpp @@ -0,0 +1,206 @@ +#ifndef _MOURISL_COMPACTDS_BITVECTOR_PLAIN +#define _MOURISL_COMPACTDS_BITVECTOR_PLAIN + +#include "Utils.hpp" +#include "Bitvector.hpp" + +#include "DS_Rank.hpp" +#include "DS_Select.hpp" + +// The bitvector with +namespace 
compactds { +class Bitvector_Plain : public Bitvector +{ +private: + size_t _n ; // the total raw length of the bits + + // Variables for the bit vector + WORD *_B ; // bitvector packed in WORD array + + // Variables for _ranking query + DS_Rank9 _rank ; + int _rb ; + + // Variables for _selection + DS_Select _select ; + int _sb ; + + int _selectSpeed ; + int _selectTypeSupport ; + +public: + Bitvector_Plain() + { + _n = _rb = _sb = 0 ; + _B = NULL ; + _selectSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + _selectTypeSupport = 3 ; + } + ~Bitvector_Plain() {Free();} + + void SetRankBlockLength(int rBlockSize) + { + _rb = rBlockSize ; + } + + void SetSelectBlockLength(int sBlockSize) + { + _sb = sBlockSize ; + } + + void SetSelectSpeed(int selectSpeed) + { + this->_selectSpeed = selectSpeed ; + } + + void SetSelectTypeSupport(int selectTypeSupport) + { + this->_selectTypeSupport = selectTypeSupport ; + } + + + void Malloc(const size_t &n) + { + this->_n = n ; + _B = Utils::MallocByBits(n) ; + + _space = Utils::BitsToWordBytes(n) ; + } + + void Free() + { + if (_B != NULL) + { + free(_B) ; + _B = NULL ; + } + _rank.Free() ; + _select.Free() ; + _n = 0 ; + } + + // Use with caution that the _rank and + void BitSet(size_t i) + { + Utils::BitSet(_B, i) ; + } + + void BitClear(size_t i) + { + Utils::BitClear(_B, i) ; + } + + // W is the plain bit vector + void Init(const WORD *W, const size_t n) + { + _n = n ; + Malloc(n) ; + memcpy(_B, W, Utils::BitsToWordBytes(n)) ; + + Init() ; + } + + // This is for when _B is already allocated + void Init() + { + _space = Utils::BitsToWordBytes(_n) ; + _rank.Free() ; + _select.Free() ; + //_rank.Init(_rb, _B, _n) ; + _rank.Init(_B, _n) ; + _space += _rank.GetSpace() - sizeof(_rank) ; + _select.Init(_sb, _B, _n, _selectSpeed, _selectTypeSupport) ; + _space += _select.GetSpace() - sizeof(_select) ; + } + + // Return the ith bits (0-based) + int Access(size_t i) const + { + return Utils::BitRead(_B, i) ; + } + + // Return the number of 1s 
before i + size_t Rank1(size_t i, int inclusive = 1) const + { + return _rank.Query(i, _B, _n, inclusive) ; + } + + // Return the index of th i-th (this i is 1-based, so rank and select are inversible) 1 + size_t Select(size_t i) const + { + return _select.Query(i, _rank, _B, _n) ; + } + + size_t Select(int type, size_t i) const + { + if (type == 1) + return _select.Query(i, _rank, _B, _n) ; + else + return _select.Query0(i, _rank, _B, _n) ; + } + + // Pred/successor on bit 0 + size_t Pred0(size_t i) const + { + return Select(0, Rank(0, i)) ; + } + + size_t Succ0(size_t i) const + { + return Select(0, Rank(0, i, 0) + 1) ; + } + + size_t GetSpace() + { + return _space + sizeof(*this) ; + } + + const WORD *GetData() const + { + return _B ; + } + + void Save(FILE *fp) + { + Bitvector::Save(fp) ; + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _rb) ; + SAVE_VAR(fp, _sb) ; + SAVE_VAR(fp, _selectSpeed) ; + SAVE_VAR(fp, _selectTypeSupport) ; + if (_n > 0) + { + fwrite(_B, sizeof(*_B), Utils::BitsToWords(_n), fp) ; + _rank.Save(fp) ; + _select.Save(fp) ; + } + } + + void Load(FILE *fp) + { + Free() ; + Bitvector::Load(fp) ; + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _rb) ; + LOAD_VAR(fp, _sb) ; + LOAD_VAR(fp, _selectSpeed) ; + LOAD_VAR(fp, _selectTypeSupport) ; + + if (_n > 0) + { + _B = Utils::MallocByBits(_n) ; + fread(_B, sizeof(*_B), Utils::BitsToWords(_n), fp) ; + _rank.Load(fp) ; + _select.Load(fp) ; + } + else + { + _B = NULL ; + //_rank.Free() ; + //_select.Free() ; + } + } +} ; +} + +#endif diff --git a/compactds/Bitvector_RunLength.hpp b/compactds/Bitvector_RunLength.hpp new file mode 100644 index 0000000..dfe8c07 --- /dev/null +++ b/compactds/Bitvector_RunLength.hpp @@ -0,0 +1,228 @@ +#ifndef _MOURISL_COMPACTDS_BITVECTOR_RUNLENGTH +#define _MOURISL_COMPACTDS_BITVECTOR_RUNLENGTH + +#include "Utils.hpp" + +#include "Bitvector.hpp" +#include "Bitvector_Sparse.hpp" +#include "PartialSum.hpp" +#include "SimpleVector.hpp" + +// The run-length bitvector built upon the sparse bit vector 
+// Based on section: +namespace compactds { +class Bitvector_RunLength: public Bitvector +{ +protected: + bool _zerofirst ; // whether the run-length array starts with 0 or _not. + int _partialSumSpeed ; + size_t _n ; // total _number of bits + size_t _rcnt ; // the _number of runs + + PartialSum _R ; // the partial sum of runs + PartialSum _O ; // the partial sum of 1s +public: + Bitvector_RunLength() + { + _partialSumSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + } + ~Bitvector_RunLength() {} + + void Free() + { + _R.Free() ; + _O.Free() ; + } + + size_t GetSpace() + { + return _R.GetSpace() - sizeof(_R) + + _O.GetSpace() - sizeof(_O) + sizeof(*this) ; + } + + void SetSelectSpeed(int speed) + { + } + + void SetPartialSumSpeed(int _partialSumSpeed) + { + _R.SetSpeed(_partialSumSpeed) ; + _O.SetSpeed(_partialSumSpeed) ; + } + + void SetSupportSelect(int supportSelect) + { + _O.SetSupportSearch(supportSelect) ; + } + + // W is the plain bit vector + void Init(const WORD *W, const size_t n) + { + size_t i ; + if (n == 0) + return ; + _n = n ; + _zerofirst =false ; + if (Utils::BitRead(W, 0) == 0) + _zerofirst = true ; + + WORD *B = Utils::MallocByBits(n + 1) ; // bits for the sums of runs + WORD *oneB = Utils::MallocByBits(n + 1) ; // bits for the sums of runs of 1s + Utils::BitSet(B, 0) ; + Utils::BitSet(oneB, 0) ; + size_t oneLen = 0 ; // run length for ones + int prevc = 0 ; + if (!_zerofirst) + { + prevc = 1 ; + oneLen = 1 ; + } + + for (i = 1 ; i < n ; ++i) + { + int c = Utils::BitRead(W, i) ; + if (c) + ++oneLen ; + if (c != prevc) + { + Utils::BitSet(B, i) ; + if (c == 0) // previous c == 1 + Utils::BitSet(oneB, oneLen) ; + } + prevc = c ; + } + + Utils::BitSet(B, n) ; + Utils::BitSet(oneB, oneLen) ; + _R.InitFromBitvector(B, n + 1) ; + _O.InitFromBitvector(oneB, oneLen + 1) ; + free(B) ; + free(oneB) ; + + /*size_t len = 1 ; + SimpleVector rlens ; + rlens.Reserve(_n / WORDBITS + 1) ; + for (i = 1 ; i < _n ; ++i) + { + if (Utils::BitRead(W, i) != 
Utils::BitRead(W, i - 1)) + { + rlens.PushBack(len) ; + len = 1 ; + } + else + ++len ; + } + rlens.PushBack(len) ; + InitFromRunLength(rlens.BeginAddress(), rlens.Size(), _n, _zerofirst) ;*/ + } + + void InitFromRunLength(const int *rlens, const size_t rcnt, const size_t n, const bool zerofirst) + { + this->_rcnt = rcnt ; + this->_n = n ; + this->_zerofirst = zerofirst ; + + _R.Init(rlens, _rcnt) ; + + size_t i = 0 ; + uint64_t *oneSums = (uint64_t *)malloc(sizeof(*oneSums) * (_rcnt / 2 + 2)); + + if (_zerofirst) + i = 1 ; + + uint64_t psum = 0 ; + for ( ; i < _rcnt ; i += 2) + { + oneSums[i/2] = psum ; + psum += rlens[i] ; + } + oneSums[i/2] = psum ; + _O.InitFromPartialSum(oneSums, i/2) ; + free(oneSums) ; + } + + // Return the ith bits (0-based) + int Access(size_t i) const + { + size_t ri = _R.Search(i) ; + int inOne = (ri&1) ; //whether ri is block for 1 or 0 + if (!_zerofirst) + inOne = 1 - inOne ; + return inOne ; + } + + // Return the _number of 1s before i + size_t Rank1(size_t i, int inclusive = 1) const + { + size_t ri = _R.Search(i) ; // run-length block index + size_t oi = ri / 2 ; + int inOne = (ri&1) ; //whether ri is block for 1 or 0 + if (!_zerofirst) + inOne = 1 - inOne ; + if (!inOne) + { + // Each small block is a (00..11..) xx, + // or (11..00..) runs, + // so we _need to adjust whether we want the sum of 1's include the current small bock or _not. + if (_zerofirst) + return _O.Sum(oi) ; + else + return _O.Sum(oi + 1) ; + } + else + { + //The sum of 1s before current run and the _number of 1s in the current block + return _O.Sum(oi) + (i - _R.Sum(ri) + inclusive) ; + } + } + + // Return the index of th i-th (i is 1-based, so rank and select are inversible) 1 + // Did not have a select mode for 0 here. 
+ size_t Select(size_t i) const + { + if (i == 0) + return POSITIVE_INF ; + + --i ; + size_t oi = _O.Search(i) ; + // Map oi back to the ri + size_t ri = 2 * oi ; + if (_zerofirst) + ++ri ; + return _R.Sum(ri) + (i - _O.Sum(oi)) ; + } + + size_t Select(int type, size_t i) const + { + if (type == 1) + return Select(i) ; + else + return POSITIVE_INF ; + } + + void Save(FILE *fp) + { + Bitvector::Save(fp) ; + SAVE_VAR(fp, _zerofirst) ; + SAVE_VAR(fp, _partialSumSpeed) ; + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _rcnt) ; + _R.Save(fp) ; + _O.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + Bitvector::Load(fp) ; + LOAD_VAR(fp, _zerofirst) ; + LOAD_VAR(fp, _partialSumSpeed) ; + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _rcnt) ; + _R.Load(fp) ; + _O.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/Bitvector_Sparse.hpp b/compactds/Bitvector_Sparse.hpp new file mode 100644 index 0000000..c20ad35 --- /dev/null +++ b/compactds/Bitvector_Sparse.hpp @@ -0,0 +1,336 @@ +#ifndef _MOURISL_COMPACTDS_BITVECTOR_SPARSE +#define _MOURISL_COMPACTDS_BITVECTOR_SPARSE + +#include "Utils.hpp" +#include "Bitvector.hpp" +#include "FixedSizeElemArray.hpp" +#include "Bitvector_Plain.hpp" + +// The very sparse bitvector based on chaptor 4.4 +// This data structure seems can be used for set predecessor query when +// the elements are increasing? +// +// This is a super clever data structure. Come and think about it from time to time. 
+namespace compactds { +class Bitvector_Sparse: public Bitvector +{ +private: + size_t _n ; // total length of the bit vector + size_t _onecnt ; // number of 1s in the bit vector + size_t _lastOneIdx ; // the index of the last one + int _lowerBits ; // the split for lower and upper bits + + FixedSizeElemArray _L ; // stores lower bits (of size r) + Bitvector_Plain _H ; // stores the higher bits + + int _hSelectSpeed ; // this is the speed for H +public: + Bitvector_Sparse() + { + _lowerBits = 0 ; + _hSelectSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + } + ~Bitvector_Sparse() + { + Free() ; + } + + void Free() + { + _n = 0 ; + _L.Free() ; + _H.Free() ; + } + + size_t GetSpace() + { + return _space + _L.GetSpace() - sizeof(_L) + + _H.GetSpace() - sizeof(_H) + sizeof(*this) ; + } + + void SetLowerBits(int lowerBits) + { + this->_lowerBits = lowerBits ; + } + + // the speed is for H.select + void SetSpeed(int speed) + { + _H.SetSelectSpeed(speed) ; + } + + void SetSupportRank(bool supportRank) + { + _H.SetSelectTypeSupport(supportRank ? 
3 : 2) ; + } + + size_t GetLastOneIdx() + { + return _lastOneIdx ; + } + + size_t GetOneCnt() + { + return _onecnt ; + } + + // Init directly from the bit vector + // W is the plain bit vector + void Init(const WORD *W, const size_t n) + { + size_t i, k ; + size_t wordCnt = Utils::BitsToWords(n) ; + + _n = n ; + _onecnt = 0 ; + for (i = 0 ; i < wordCnt ; ++i) + _onecnt += Utils::Popcount(W[i]) ; + + if (_onecnt == 0 || wordCnt == 0) + { + _lastOneIdx = 0 ; + return ; + } + + // Get the last 1 + i = wordCnt - 1 ; + while (1) + { + if (W[i] == 0) + { + --i ; + continue ; + } + else + { + int j ; + for (j = WORDBITS - 1 ; j >= 0 ; --j) + { + if ((W[i] >> j) & 1) + { + _lastOneIdx = i * WORDBITS + j ; + break ; + } + } + break ; + } + if (i == 0) + break ; + else + --i ; + } + + if (_lowerBits == 0) + _lowerBits = int(log((double)n / _onecnt) / log(2.0)) ; + + if (_lowerBits < 1) + _lowerBits = 1 ; + _L.Malloc(_lowerBits, _onecnt) ; + + size_t hsize = (_lastOneIdx >> _lowerBits) + _onecnt + 1; // need +1 here to accommdate the max value + ++hsize ; // Plus one here is because we want to append a 0 to the last block + _H.Malloc(hsize) ; + + k = 0 ; + for (i = 0 ; i < n ; i += WORDBITS) + { + WORD w = W[i/WORDBITS] ; + if (w == 0) + continue ; + size_t j ; + for (j = 0 ; j < WORDBITS && i + j < n ; ++j) + if ((w >>j) & 1) + { + _L.Write(k, (i + j) & MASK(_lowerBits)) ; + _H.BitSet(((i + j) >> _lowerBits) + k) ; + ++k ; + } + } + _H.Init() ; + } + + // Init from the know positions of 1s + void InitFromOnes(const uint64_t *S, const size_t onecnt, const size_t n) + { + size_t i ; + + _n = n ; + _onecnt = onecnt ; + if (_onecnt > 0) + _lastOneIdx = S[onecnt - 1] ; + else + { + _lastOneIdx = 0 ; + return ; + } + + if (_lowerBits == 0) + _lowerBits = int(log((double)n / onecnt) / log(2.0)) ; + + if (_lowerBits < 1) + _lowerBits = 1 ; + _L.Malloc(_lowerBits, onecnt) ; + + size_t hsize = (S[onecnt - 1] >> _lowerBits) + onecnt + 1; // need +1 here to accommdate the max value + 
++hsize ; // Plus one here is because we want to append a 0 to the last block + _H.Malloc(hsize) ; + for (i = 0 ; i < onecnt ; ++i) + { + _L.Write(i, S[i] & MASK(_lowerBits)) ; + _H.BitSet((S[i] >> _lowerBits) + i) ; + } + _H.Init() ; + } + + // Return the ith bits (0-based) + int Access(size_t i) const + { + if (Pred(i) == i) + return 1 ; + else + return 0 ; + } + + // Return the number of 1s before i + size_t Rank1(size_t i, int inclusive = 1) const + { + if (inclusive == 0) + { + if (i == 0) + return 0 ; + else + --i ; + } + + if (i >= _lastOneIdx) // this should contains the case that i>=n + return _onecnt ; + + size_t iH = i >> _lowerBits ; + size_t iL = i & MASK(_lowerBits) ; + size_t l, m, r ; + + // We don't want to +1 for iH in select because the + // 0 marks the beginning the block with starts with iH<= last one index, H.Select(iH)==H.Select(iH+1), + // then l>r. + // Fortunately, we handle this case at the beginning. + if (iH == 0 || _H.Access( selectIH + 1 ) != 0) + r = _H.Select(0, iH + 1) - iH ; + else + r = selectIH + 1 - iH ; + + if (l == r || _L.Read(l) > iL) + { + // The current r block is empty + // or the first element in the block is greater than what we search for. + // So the number of 1s before current block (l) is the answer + return l ; + } + + // r points to the start of the next r block, so we need -1 + // to make it match with the end of the current block + // The l==r test above makes sure r-1 is non-negative here. + --r ; + while (l <= r) + { + m = (l + r) / 2 ; + if (_L.Read(m) <= iL) + l = m + 1 ; + else + { + if (r == 0) + break ; // the test before the binary search make sure at least one element + // in the block is less than the desired target, + // so we can directly termiante the binary search. 
+ else + r = m - 1 ; + } + } + return l ; // l-1 is the last element index <= the desired one, so l is the number element + } + + // Return the index of th i-th (i is 1-based, so rank and select are inversible) 1 + size_t Select(size_t i) const + { + if (i > _onecnt) + return _lastOneIdx ; + if (i == 0) + return POSITIVE_INF ; + // Use (i-1) instead of i to convert the 1-based to 0-based, which is + // the base when creating H. + return ((_H.Select(1, i) - (i - 1)) << _lowerBits) + _L.Read(i - 1) ; + } + + size_t Select(int type, size_t i) const + { + if (type == 1) + return Select(i) ; + else + { + // Sadly, we can only do plain binary search + // Haven't tested it yet. + // Don't recommend this operation. + size_t l = 0 ; + size_t r = _n - 1 ; + size_t m ; + + while (l <= r) + { + m = (l + r) / 2 ; + if (m - Select(m) < i) + l = m + 1 ; + else + { + if (m == 0) + return 0 ; + else + r = m - 1 ; + } + } + return r + 1 ; + } + } + + void Save(FILE *fp) + { + Bitvector::Save(fp) ; + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _onecnt) ; + SAVE_VAR(fp, _lastOneIdx) ; + SAVE_VAR(fp, _lowerBits) ; + SAVE_VAR(fp, _hSelectSpeed) ; + _L.Save(fp) ; + _H.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + Bitvector::Load(fp) ; + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _onecnt) ; + LOAD_VAR(fp, _lastOneIdx) ; + LOAD_VAR(fp, _lowerBits) ; + LOAD_VAR(fp, _hSelectSpeed) ; + _L.Load(fp) ; + _H.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/CompactMapper.hpp b/compactds/CompactMapper.hpp new file mode 100644 index 0000000..661f5f4 --- /dev/null +++ b/compactds/CompactMapper.hpp @@ -0,0 +1,130 @@ +#ifndef _MOURISL_COMPACTDS_COMPACTMAPPER +#define _MOURISL_COMPACTDS_COMPACTMAPPER + +// Map a set of m distinct elements to [0,m-1] +#include +#include +#include + +#include "FixedSizeElemArray.hpp" +#include "Bitvector_Plain.hpp" +#include "Bitvector_Sparse.hpp" + +namespace compactds { +class CompactMapper +{ +private: + bool _sparse ; // whether use sparse representation + 
Bitvector_Plain _P ; + Bitvector_Sparse _S ; + size_t _m ; +public: + CompactMapper() + { + } + + ~CompactMapper() + { + Free() ; + } + + size_t GetSpace(int inclusive = true) + { + return _P.GetSpace() - sizeof(_P) + _S.GetSpace() - sizeof(_S) + (inclusive ? sizeof(*this) : 0) ; + } + + void Free() + { + _P.Free() ; + _S.Free() ; + } + + void Init(const FixedSizeElemArray &a, size_t n, bool sparse) + { + size_t i ; + _sparse = sparse ; + if (sparse) + { + std::map reduceMap ; + std::vector elems ; + size_t max = 0 ; + for (i = 0 ; i < n ; ++i) + { + size_t tmp = a.Read(i) ; + if (reduceMap.find(tmp) == reduceMap.end()) + { + reduceMap[tmp] = i ; + elems.push_back(tmp) ; + if (tmp > max) + max = tmp ; + } + } + std::sort(elems.begin(), elems.end()) ; + _m = elems.size() ; + _S.InitFromOnes(elems.data(), max + 1, _m) ; + } + else + { + size_t max = 0 ; + for (i = 0 ; i < n ; ++i) + { + size_t tmp = a.Read(i) ; + if (tmp > max) + max = tmp ; + } + + _P.Malloc(max + 1) ; + for (i = 0 ; i < n ; ++i) + { + size_t tmp = a.Read(i) ; + _P.BitSet(tmp) ; + } + _P.Init() ; + _m = _P.Rank1(max) ; + } + } + + size_t GetCompactSize() const + { + return _m ; + } + + size_t Map(size_t v) const + { + if (_sparse) + return _S.Rank1(v, 0) ; + else + return _P.Rank1(v, 0) ; + } + + size_t MapBack(size_t i) const + { + if (_sparse) + return _S.Select(i + 1) ; + else + return _P.Select(i + 1) ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _sparse) ; + SAVE_VAR(fp, _m) ; + if (_sparse) + _S.Save(fp) ; + else + _P.Save(fp) ; + } + + void Load(FILE *fp) + { + LOAD_VAR(fp, _sparse) ; + LOAD_VAR(fp, _m) ; + if (_sparse) + _S.Load(fp) ; + else + _P.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/CompressedSuffixArray.hpp b/compactds/CompressedSuffixArray.hpp new file mode 100644 index 0000000..d0e7cdc --- /dev/null +++ b/compactds/CompressedSuffixArray.hpp @@ -0,0 +1,154 @@ +#ifndef _MOURISL_COMPACTDS_COMPRESSED_SUFFIX_ARRAY +#define _MOURISL_COMPACTDS_COMPRESSED_SUFFIX_ARRAY + 
+#include "Bitvector_Sparse.hpp" +#include "Sequence_WaveletTree.hpp" + +namespace compactds { +class CompressedSuffixArray +{ +private: + size_t _space ; + Bitvector_Sparse *_Psi ; // Psi for each alphabet + Bitvector_Sparse _D ; // mark the positions of starting alphabet in suffix + Alphabet _alphabets ; // use plain alphabet set here. + size_t _n ; + size_t *_alphabetPartialSum ; + size_t _firstISA ; + ALPHABET _lastChr ; + WORD **_psiB ; // bits for encoding Psis + + size_t Rank(Sequence_WaveletTree &BWT, ALPHABET c, size_t p, int inclusive = 1) + { + size_t ret = BWT.Rank(c, p, inclusive) ; + // Since we do not use $, the last character in the original string + // will be moved to the _firstISA instead of the first position + // We need to move this back + // Potential future refactoring: appending an A to the end of the string + if (c == _lastChr && (p < _firstISA || (!inclusive && p == _firstISA))) + ++ret ; + return ret ; + } + +public: + CompressedSuffixArray() + { + _n = _space = 0 ; + } + ~CompressedSuffixArray() {} + + void Free() + { + if (_n > 0) + { + delete[] _Psi ; + } + + if (_psiB != NULL) + { + int alphabetSize = _alphabets.GetSize() ; + int i ; + for (i = 0 ; i < alphabetSize ; ++i) + free(_psiB[i]) ; + free(_psiB) ; + } + } + + // Allocate necessary memories for reading SAs + void Prepare(size_t n, ALPHABET *alphabetList) + { + int i ; + + _n = n ; + _alphabets.InitFromList(alphabetList, strlen(alphabetList)) ; + + int alphabetSize = _alphabets.GetSize() ; + _Psi = new Bitvector_Sparse[alphabetSize] ; + _psiB = (WORD **)malloc(sizeof(_psiB[0]) * alphabetSize) ; + for (i = 0 ; i < alphabetSize ; ++i) + _psiB[i] = Utils::MallocByBits(n) ; + _alphabetPartialSum = (size_t *)calloc(alphabetSize + 1, sizeof(size_t)) ; + } + + + // sa corresponding to SA[from..to], inclusive + void ReadSaChunk(FixedSizeElemArray &T, size_t n, size_t *sa, size_t from, size_t to) + { + size_t i ; + + if (to >= n) + to = n - 1 ; + int alphabetSize = _alphabets.GetSize() 
; + size_t *alphabetCount = (size_t *)calloc(alphabetSize + 1, sizeof(size_t)) ; + for (i = from ; i <= to ; ++i) + { + size_t s = sa[i - from] ; + if (s == 0) + continue ; + int c = T.Read(s - 1) ; + Utils::BitSet(_psiB[c], s) ; + ++alphabetCount[c] ; + } + _lastChr = T.Read(n - 1) ; + for (i = 1 ; i < alphabetSize ; ++i) + alphabetCount[i] += alphabetCount[i - 1] ; + for (i = 0 ; i < alphabetSize ; ++i) + _alphabetPartialSum[i + 1] += alphabetCount[i] ; + } + + // Compress everything + void Init() + { + size_t i ; + int alphabetSize = _alphabets.GetSize() ; + + for (i = 0 ; i < alphabetSize ; ++i) + _Psi[i].Init(_psiB[i], _n) ; + + for (i = 0 ; i < alphabetSize ; ++i) + free(_psiB[i]) ; + free(_psiB) ; + _psiB = NULL ; + } + + void InitFromBWT(FixedSizeElemArray &BWT, size_t n, size_t firstISA, ALPHABET *alphabetList) + { + size_t i ; + _n = n ; + _firstISA = firstISA ; + + Prepare(n, alphabetList) ; + + // Prepare the auxiliary data, e.g. rank, for BWT + int alphabetSize = _alphabets.GetSize() ; + _alphabetPartialSum = (size_t *)calloc(alphabetSize, sizeof(size_t)) ; + + for (i = 0 ; i < n ; ++i) + ++_alphabetPartialSum[ BWT.Read(i)] ; + for (i = 1 ; i < alphabetSize ; ++i) + _alphabetPartialSum[i] += _alphabetPartialSum[i - 1] ; + for (i = alphabetSize ; i > 0 ; --i) + _alphabetPartialSum[i] = _alphabetPartialSum[i - 1] ; + _alphabetPartialSum[0] = 0 ; + _lastChr = alphabetList[ BWT.Read(firstISA) ] ; + + Sequence_WaveletTree seqBWT ; + seqBWT.Init(BWT, n, alphabetList) ; + + // Compute Psi from BWT. The Psi for each alphabet is marked on the bit array psiB + size_t lastISA = _alphabetPartialSum[ _alphabets.Encode(_lastChr) ] ; + size_t p = lastISA; + for (i = 0 ; i < n ; ++i) + { + int c = BWT.Read(p) ; + size_t lf = _alphabetPartialSum[c] + Rank(seqBWT, alphabetList[c], p) ; + Utils::BitSet(_psiB[c], p) ; // Psi[lf] = p. 
Psi is the inverse function of LF mapping + p = lf ; + } + + Init() ; + } +} ; +} + +#endif diff --git a/compactds/DS_InvPermutation.hpp b/compactds/DS_InvPermutation.hpp new file mode 100644 index 0000000..186d583 --- /dev/null +++ b/compactds/DS_InvPermutation.hpp @@ -0,0 +1,137 @@ +#ifndef _MOURISL_COMPACTDS_DS_INVPERMUTATION +#define _MOURISL_COMPACTDS_DS_INVPERMUTATION + +#include "Utils.hpp" +#include "Bitvector_Sparse.hpp" +#include "Bitvector_Plain.hpp" + +// The standalone data structure for inverse query on a plain permutation using the idea of short churt. +// Time complexity: O(t) +// Space complexity: O(n/t * logn) +// Based Chapter 5.1. Difference is that the book samples is one off than this implementation +// This could be also useful for encoding the inverse function of an 1-to-1 mapping + +namespace compactds { +class DS_InvPermutation +{ +private: + size_t _t ; // step size + size_t _space ; + Bitvector_Plain _B ; // mark whether a position is sampled + FixedSizeElemArray _S ; // sampled pointer with value Pi^{-t}[x] + size_t _sampledCnt ; // |_S| +public: + DS_InvPermutation() + { + _space = 0 ; + _t = 0 ; + } + + ~DS_InvPermutation() + { + Free() ; + } + + void Free() + { + _S.Free() ; + } + + size_t GetSpace() + { + return _space + _B.GetSpace() - sizeof(_B) + sizeof(*this) ; + } + + void SetSampleRate(size_t t) + { + _t = t ; + } + + void Init(size_t *Pi, size_t n) + { + if (_t == 0) + _t = Utils::Log2Ceil(n) ; + WORD *B = Utils::MallocByBits(n) ; // the label sampled positions + WORD *V = Utils::MallocByBits(n) ; // the bits mark the cycles + + size_t i, j, k ; + + _sampledCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(V, i)) + continue ; + Utils::BitSet(V, i) ; + j = Pi[i] ; + k = 1 ; + while (j != i) + { + Utils::BitSet(V, j) ; + if (k % _t == 0) + { + Utils::BitSet(B, j) ; + ++_sampledCnt ; + } + j = Pi[j] ; + ++k ; + } + + if (k > _t) // may exist dangling part, without this, the time could be 2*_t-1 + { + 
Utils::BitSet(B, i) ; + ++_sampledCnt ; + } + } + + _B.Init(B, n) ; + _S.Malloc(Utils::Log2Ceil(n), _sampledCnt) ; + for (i = 0 ; i < n ; ++i) + { + if (!Utils::BitRead(V, i)) + continue ; + + Utils::BitFlip(V, i) ; + j = Pi[i] ; + while (Utils::BitRead(V, j)) + { + if (Utils::BitRead(B, j)) + { + // Since B[j]==1, use inclusive==0 automatically subtract the rank value by 1 + _S.Write( _B.Rank(1, j, 0), i) ; + i = j ; + } + Utils::BitFlip(V, j) ; + j = Pi[j] ; + } + if (Utils::BitRead(B, j)) + { + _S.Write( _B.Rank(1, j, 0), i) ; + } + i = j ; + } + + free(B) ; + free(V) ; + } + + //@return: Pi^{-1}[i] + size_t Query(size_t *Pi, size_t i) + { + size_t j = i ; + bool jumped = false ; + while (Pi[j] != i) + { + if (!jumped && _B.Access(j)) + { + j = _S.Read(_B.Rank(1, j, 0)) ; + jumped = true ; + } + else + j = Pi[j] ; + } + return j ; + } +} ; +} + +#endif diff --git a/compactds/DS_Parenthesis.hpp b/compactds/DS_Parenthesis.hpp new file mode 100644 index 0000000..8429eb3 --- /dev/null +++ b/compactds/DS_Parenthesis.hpp @@ -0,0 +1,150 @@ +#ifndef _MOURISL_COMPACTDS_PARENTHESIS +#define _MOURISL_COMPACTDS_PARENTHESIS + +#include "Utils.hpp" +#include "DS_RangeMinMaxTree.hpp" +#include "DS_PatternRankSelect.hpp" + +namespace compactds { +class DS_Parenthesis +{ +private: + DS_RangeMinMaxTree _rmmTree ; + DS_PatternRankSelect _patRS ; + + void GenerateRandomBalanceParenthesisSegment(WORD *B, size_t n, size_t i, size_t j) + { + if (j == i + 1) + { + Utils::BitsWrite(B, i, j, 2) ; // write binary 10 + return ; + } + else if (j <= i) + { + return ; + } + + size_t split = i + rand() % (j - i + 1) ; + while ((split - i + 1) % 2 == 1 ) + split = i + rand() % (j - i + 1) ; + + Utils::BitSet(B, i) ; + Utils::BitClear(B, j) ; + if (split == i || split == j) + GenerateRandomBalanceParenthesisSegment(B, n, i + 1, j - 1) ; + else + { + Utils::BitClear(B, split) ; + Utils::BitSet(B, split + 1) ; + GenerateRandomBalanceParenthesisSegment(B, n, i + 1, split - 1) ; + 
GenerateRandomBalanceParenthesisSegment(B, n, split + 2, j - 1) ; + } + } + +public: + DS_Parenthesis() {} + ~DS_Parenthesis() {} + + void Free() + { + _rmmTree.Free() ; + _patRS.Free() ; + } + + void SetRmmTreeBlockSize(size_t b) + { + _rmmTree.SetBlockSize(b) ; + } + + size_t GetSpace(bool inclusive = true) + { + return _rmmTree.GetSpace(false) + (inclusive ? sizeof(*this) : 0) ; + + } + + void Init(const WORD *B, size_t n, WORD pat, int patLen) + { + _rmmTree.Init(B, n) ; + if (patLen > 0) + _patRS.Init(B, n, pat, patLen) ; + } + + // Expose the internal rmmTree. + const DS_RangeMinMaxTree& GetRmmTree() const + { + return _rmmTree ; + } + + size_t Close(size_t i, const WORD *B, size_t n) const + { + // Notice that our FwdSearch include the effect of i, so the d is slightly different than the textbook. + return _rmmTree.FwdSearch(i, 0, B, n) ; + } + + size_t Open(size_t i, const WORD *B, size_t n) const + { + return _rmmTree.BwdSearch(i, 0, B, n) ; + } + + size_t Enclose(size_t i, const WORD *B, size_t n) const + { + return _rmmTree.BwdSearch(i, -1 - Utils::BitRead(B, i), B, n) ; + } + + bool IsBalance(const WORD *B, size_t n) const + { + size_t i ; + int64_t excess = 0 ; + for (i = 0 ; i < n ; ++i) + { + excess += (2 * Utils::BitRead(B, i) - 1 ) ; + if (excess < 0) + return false ; + } + return true ; + } + + size_t PatternRank(size_t i, const WORD *B, size_t n, int inclusive = 1) const + { + return _patRS.Rank(i, B, n, inclusive) ; + } + + size_t PatternSelect(size_t i, const WORD *B, size_t n) const + { + return _patRS.Select(i, B, n) ; + } + + void GenerateRandomBalanceParenthesis(WORD *B, size_t n, int seed = 17) + { + srand(seed) ; + GenerateRandomBalanceParenthesisSegment(B, n, 0, n - 1) ; + } + + void Print(FILE *fp, const WORD *B, size_t n) + { + size_t i ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + fprintf(fp, "(") ; + else + fprintf(fp, ")") ; + } + fprintf(fp, "\n") ; + } + + void Save(FILE *fp) + { + _rmmTree.Save(fp) ; + 
_patRS.Save(fp) ; + } + + void Load(FILE *fp) + { + _rmmTree.Load(fp) ; + _patRS.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/DS_PatternRankSelect.hpp b/compactds/DS_PatternRankSelect.hpp new file mode 100644 index 0000000..2e87782 --- /dev/null +++ b/compactds/DS_PatternRankSelect.hpp @@ -0,0 +1,272 @@ +#ifndef _MOURISL_COMPACTDS_DS_PATTERN_RANK_SELECT +#define _MOURISL_COMPACTDS_DS_PATTERN_RANK_SELECT + +// Binary search based method to calculate for pattern (not bit, but several bits). +// The tree structure looks like range min max tree, where we pre-record the information into blocks. +#include "Utils.hpp" + +namespace compactds { +class DS_PatternRankSelect +{ +private: + size_t _r ; // number of regions + size_t _b ; // block size + size_t _height ; + size_t *_counts ; // pattern count in each tree node + WORD _pat ; + int _patLen ; // pat is at most 64bits + size_t _space ; + + // Maps leaf id k to the tree idx + size_t LeafNum(size_t k) const + { + const size_t width = 1ull<<_height ; + if (k < 2 * _r - width) + return width - 1 + k ; + else + return width - 1 - _r + k ; + } + + // Maps tree index to leaf index + size_t NumLeaf(size_t v) const + { + const size_t width = 1ull<<_height ; + if (v >= width - 1) + return v - width + 1 ; + else + return v - width + 1 + _r ; + } + + //@return: the 0-based level of tree index v located at + int GetLevel(size_t v) const + { + return Utils::CountBits(v + 1) - 1 ; + } + + // The max tree index that is at the same level as v + size_t LevelMaxNum(size_t v) const + { + return (1ull<<(GetLevel(v) + 1)) - 2 ; + } + + // The min tree index that is at the same level as v + size_t LevelMinNum(size_t v) const + { + return LevelMaxNum(v) / 2 ; + } + + // Find the tree index containing v, l levels higher + size_t PromoteLevel(size_t v, int l) const + { + return ((v+1) >> l) - 1; + } + + // Get the leftmost and rightmost leaf id (in leaf idx) + // Haven't validated yet. 
+ size_t GetLeftmostLeaf(size_t v) const + { + size_t l = GetLevel(v) ; + size_t diff = v - LevelMinNum(v) ; + // Each node on this level covers chunk amount of leaves + size_t chunk = (1ull << (_height - l)) ; + size_t ret = (1 << _height) - 1 + diff * chunk ; + // Pretend this is a complete tree, and then adjust the extra leaves + if (ret > 2 * _r - 1) + ret /= 2 ; + return NumLeaf(ret) ; + } + + size_t GetRightmostLeaf(size_t v) const + { + size_t l = GetLevel(v) ; + size_t diff = v - LevelMinNum(v) ; + // Each node on this level covers chunk amount of leaves + size_t chunk = (1ull << (_height - l)) ; + size_t ret = (1 << _height) - 1 + (diff + 1) * chunk - 1 ; + // Pretend this is a complete tree, and then adjust the extra leaves + if (ret > 2 * _r - 1) + ret = (ret - 1) / 2 ; + return NumLeaf(ret) ; + } + +public: + DS_PatternRankSelect() + { + _space = _b = _r = 0 ; + _counts = NULL ; + } + + ~DS_PatternRankSelect() + { + Free() ; + } + + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? 
sizeof(*this) : 0) ; + } + + void Free() + { + if (_counts) + { + free(_counts) ; + _counts = NULL ; + _r = _b = _space = 0 ; + } + } + + void SetBlockSize(size_t b) + { + _b = b ; + } + + void Init(const WORD *B, size_t n, WORD pat, int patLen) + { + size_t i, j ; + _pat = pat ; + _patLen = patLen ; + + if ((int)_b <= patLen) + // Take the block size as word bits to maintain low space usage + _b = 16 * WORDBITS ; + + _r = DIV_CEIL(n, _b) ; + _height = Utils::Log2Ceil(_r) ; + + _counts = (size_t *)malloc(sizeof(size_t) * (2 * _r - 1)) ; + _space = sizeof(*_counts) * (2*_r - 1) ; + + // Fill the leaves + for (i = 0 ; i < n ; i += _b) + { + size_t l = i / _b ; // leaf id + size_t ltid = LeafNum(l) ; + size_t count = 0 ; + // the count include the positions that may stretch to the next block + for (j = i ; j + _patLen - 1 < n && j < i + _b ; ++j) + { + WORD w = Utils::BitsRead(B, j, j + _patLen - 1) ; + if (w == pat) + ++count ; + } + _counts[ltid] = count ; + } + + // Fill the internal nodes + for (i = _r - 2 ; i < _r ; --i) + { + size_t count = _counts[2 * i + 1] ; + if (2 * i + 2 < (2 * _r - 1)) + count += _counts[2 * i + 2] ; + _counts[i] = count ; + } + } + + size_t Rank(size_t i, const WORD *B, size_t n, int inclusive = 1) const + { + if (!inclusive) + { + if (i == 0) + return 0 ; + else + --i ; + } + size_t j ; + size_t lid = i / _b ; + size_t rank = 0 ; + size_t v = 0 ; + + while (2 * v + 2 < 2 * _r - 1) + { + if (lid <= GetRightmostLeaf(2 * v + 1)) + { + v = 2 * v + 1 ; + } + else + { + rank += _counts[2 * v + 1] ; + v = 2 * v + 2 ; + } + } + + for (j = i/_b * _b ; j <= i && j + _patLen - 1 < n; ++j) + { + // TODO: this part could be optimized by read in one WORD and + // use left shift+mask to search the pattern. 
+ WORD w = Utils::BitsRead(B, j, j + _patLen - 1) ; + if (w == _pat) + ++rank ; + } + return rank ; + } + + size_t Select(size_t i, const WORD *B, size_t n) const + { + size_t j ; + size_t v = 0 ; + size_t count = 0 ; + while (2 * v + 2 < 2 * _r - 1) + { + if (_counts[2 * v + 1] + count >= i) + v = 2 * v + 1 ; + else + { + count += _counts[2 * v + 1] ; + v = 2 * v + 2 ; + } + } + + size_t k = NumLeaf(v) ; + for (j = k * _b ; j < (k + 1) * _b ; ++j) + { + WORD w = Utils::BitsRead(B, j, j + _patLen - 1) ; + if (w == _pat) + ++count ; + if (count == i) + return j ; + } + return 0 ; + } + + bool IsPattern(size_t i, const WORD *B, size_t n) const + { + if (i + _patLen - 1 >= n) + return false ; + return (Utils::BitsRead(B, i, i + _patLen - 1) == _pat) ; + + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _r) ; + SAVE_VAR(fp, _b) ; + SAVE_VAR(fp, _height) ; + SAVE_VAR(fp, _pat) ; + SAVE_VAR(fp, _patLen) ; + + if (_r > 0) + SAVE_ARR(fp, _counts, 2 * _r - 1) ; + } + + void Load(FILE *fp) + { + Free() ; + + LOAD_VAR(fp, _r) ; + LOAD_VAR(fp, _b) ; + LOAD_VAR(fp, _height) ; + LOAD_VAR(fp, _pat) ; + LOAD_VAR(fp, _patLen) ; + + if (_r > 0) + { + _counts = (size_t *)malloc(sizeof(size_t) * (2 * _r - 1)) ; + LOAD_ARR(fp, _counts, 2 * _r - 1) ; + _space = sizeof(*_counts) * (2*_r - 1) ; + } + } +} ; + +} +#endif diff --git a/compactds/DS_RangeMinMaxTree.hpp b/compactds/DS_RangeMinMaxTree.hpp new file mode 100644 index 0000000..92e1c03 --- /dev/null +++ b/compactds/DS_RangeMinMaxTree.hpp @@ -0,0 +1,920 @@ +#ifndef _MOURISL_COMPACTDS_DS_RANGEMINMMAXTREE +#define _MOURISL_COMPACTDS_DS_RANGEMINMMAXTREE + +// Based on section 7.1.1 +// Handles the excessive information in bit vector +#include "Utils.hpp" + +// Note that the excess can be negative, so we use int64_t in many places. +// The input B is like: +// 1 1 1 0 0 0 +// The excess tracking will be +// 0 1 2 3 2 1 0 +// (The search for index i is always with respect to between i-th B and (i+1)-th B.) 
+// The search for index i is always inclusive. +// The forward search will return the effect after j-th B. +// The backward search will return the effect before j-th B. +namespace compactds { +struct _rangeMinMaxTreeNode +{ + // Each number is within the block, so the value range should be small + // Assume block size is less than 2^15 + int16_t e ; // excess with respect to the beginning of the region + int16_t min ; // min e + int16_t max ; // max e + int16_t n ; // number of times hit min + + void Merge(const struct _rangeMinMaxTreeNode &b) + { + if (e + b.min < min) + { + min = e + b.min ; + n = b.n ; + } + else if (e + b.min == min) + n += b.n ; + + if (e + b.max > max) + max = e + b.max ; + e += b.e ; + } + + // Wrapper if we need the information from right to left (direction<0) + int16_t RevE() + { + return -e ; + } + + int16_t RevMin() + { + return min <= 0 ? (min - e) : -e ; + } + + int16_t RevMax() + { + return max >= 0 ? (max - e) : -e ; + } +} ; + +class DS_RangeMinMaxTree +{ +private: + size_t _space ; + size_t _r ; // number of regions + size_t _b ; // block size + size_t _n ; + size_t _height ; + struct _rangeMinMaxTreeNode *_tree ; + int _cwidth ; // chunk size + struct _rangeMinMaxTreeNode *_C ; // precomputed chunk + + // Maps leaf id k to the tree idx + size_t LeafNum(size_t k) const + { + const size_t width = 1ull<<_height ; + if (k < 2 * _r - width) + return width - 1 + k ; + else + return width - 1 - _r + k ; + } + + // Maps tree index to leaf index + size_t NumLeaf(size_t v) const + { + const size_t width = 1ull<<_height ; + if (v >= width - 1) + return v - width + 1 ; + else + return v - width + 1 + _r ; + } + + //@return: the 0-based level of tree index v located at + int GetLevel(size_t v) const + { + return Utils::CountBits(v + 1) - 1 ; + } + + // The max tree index that is at the same level as v + size_t LevelMaxNum(size_t v) const + { + size_t ret = (1ull<<(GetLevel(v) + 1)) - 2 ; + //if (ret >= 2*_r-1) + // ret = 2*_r - 2 ; + return 
ret ; + } + + // The min tree index that is at the same level as v + size_t LevelMinNum(size_t v) const + { + return LevelMaxNum(v) / 2 ; + } + + // Find the tree index containing v, l levels higher + size_t PromoteLevel(size_t v, int l) const + { + return ((v+1) >> l) - 1; + } + + // Update extreme value (min, max) + int64_t UpdateExtreme(int64_t e, int64_t x, int type) const + { + if ((type == 0 && x < e) + || (type == 1 && x > e)) + return x ; + return e ; + } + + void InitPrecomputedChunks() + { + size_t i ; + int j ; + // Precomputed block + _C = (struct _rangeMinMaxTreeNode *)malloc(sizeof(*_C) * (1<<_cwidth)) ; + _space += sizeof(*_C) * (1<<_cwidth) ; + + // Fill precomputed block + for (i = 0 ; i < (1ull<<_cwidth) ; ++i) + { + int16_t excess = 0 ; + int16_t min = 2, max = -2, minCnt = 0 ; + for (j = 0 ; j < _cwidth ; ++j) + { + excess += (2 * ((i >> j) & 1) - 1) ; + + if (excess < min || minCnt == 0) + { + min = excess ; + minCnt = 1 ; + } + else if (excess == min) + ++minCnt ; + + if (excess > max) + max = excess ; + } + _C[i].e = excess ; + _C[i].min = min ; + _C[i].max = max ; + _C[i].n = minCnt ; + } + } + + // Search excess difference after i inside the block containing i (block size is _b) + // @return: d, or the excess from i to the end of the block when no match. + // retj: the coordinate of the matched position, or just pass the block + int64_t FwdBlock(size_t i, int64_t d, size_t &retj, const WORD *B) const + { + size_t j ; + + size_t f = i / _cwidth ; // current chunk. f:from; t: to. 
+ size_t t = ((i / _b + 1) * _b ) / _cwidth - 1 ; // last chunk in the block + + int64_t excess = 0 ; + // search the current chunk + for (j = i ; j < f * _cwidth + _cwidth && j < _n ; ++j) + { + excess += (2 * Utils::BitRead(B, j) - 1) ; + + if (excess == d) + { + retj = j ; + return d ; + } + } + + // search the remaining chunks in the block + size_t p ; + for (p = f + 1 ; p <= t ; ++p) + { + WORD chunk = 0 ; + if (p * _cwidth >= _n) + break ; + + if ((p + 1) * _cwidth - 1 <= _n - 1) + chunk = Utils::BitsRead(B, p * _cwidth, (p+1) * _cwidth -1) ; + else // the chunk will be padding with 0's (\)), which may cause too small m + // but the border case will be handled after searching the hit chunk. + chunk = Utils::BitsRead(B, p * _cwidth, _n - 1) ; + + if ((d <= 0 && excess > d && excess + _C[chunk].min <= d) + || (d >= 0 && excess < d && excess + _C[chunk].max >= d)) + break ; + + excess += _C[chunk].e ; + } + + // Could not find it in current block + if (p > t) + { + retj = _cwidth * (t + 1) ; + return excess ; + } + if (p * _cwidth >= _n) + { + retj = _n ; + return excess ; + } + + // Search the hit chunk + for (j = p * _cwidth ; j < p * _cwidth + _cwidth && j < _n ; ++j) + { + excess += (2 * Utils::BitRead(B, j) - 1) ; + + if (excess == d) + { + retj = j ; + return d ; + } + } + + // Can only reach here in border case + retj = _n ; + return excess ; + } + + // Similarly to FwdBlack, but search backwards (to left) + int64_t BwdBlock(size_t i, int64_t d, size_t &retj, const WORD *B) const + { + size_t j ; + + size_t f = i / _cwidth ; // current chunk + size_t t = ((i / _b) * _b ) / _cwidth ; // first chunk in the block + + int64_t excess = 0 ; + // search the current chunk + for (j = i ; j >= f * _cwidth && j < _n ; --j) + { + excess -= (2 * Utils::BitRead(B, j) - 1) ; + + if (excess == d) + { + retj = j ; + return d ; + } + } + + //search the remaining chunks in the block + size_t p ; + for (p = f - 1 ; p >= t && p < _n ; --p) + { + WORD chunk = 0 ; + + chunk = 
Utils::BitsRead(B, p * _cwidth, (p+1) * _cwidth -1) ; + + if ((d <= 0 && excess > d && excess + _C[chunk].RevMin() <= d) + || (d >= 0 && excess < d && excess + _C[chunk].RevMax() >= d)) + break ; + + excess += _C[chunk].RevE() ; + } + + // Could not find it in current block + if (p < t) + { + retj = _cwidth * t - 1 ; + return excess ; + } + if (p >= _n) // not exist + { + retj = _n ; + return excess ; + } + + // Search the hit chunk + for (j = p * _cwidth + _cwidth - 1 ; j >= p * _cwidth && j < _n ; --j) + { + excess -= (2 * Utils::BitRead(B, j) - 1) ; + if (excess == d) + { + retj = j ; + return d ; + } + } + + // Can only reach here in border case + retj = _n ; + return excess ; + } + + // Scanning a block for min/max[i,j] + // Assumes i and j are in the same block + // As other block searches, it returns the excess in this block, + // the extreme value is passed through the reference + int64_t ExtremeBlock(size_t i, size_t j, int type, int64_t &extreme, const WORD *B) const + { + size_t p ; // the index for loop + + size_t f = i / _cwidth ; // first chunk + size_t t = j / _cwidth ; // last chunk + + int64_t excess = 0 ; + if (type == 0) + extreme = 2 ; + else + extreme = -2 ; + for (p = i ; p < f * _cwidth + _cwidth && p <= j && p < _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + extreme = UpdateExtreme(extreme, excess, type) ; + } + if (f == t) + return excess ; + + // Since we search to t-1, there is no need to worry about reading after _n + for (p = f + 1 ; p <= t - 1 ; ++p) + { + WORD chunk = Utils::BitsRead(B, p * _cwidth, (p+1) * _cwidth - 1) ; + extreme = UpdateExtreme(extreme, excess + + (type == 0 ? 
_C[chunk].min : _C[chunk].max), type) ; + excess += _C[chunk].e ; + } + + // last chunk + for (p = t * _cwidth ; p <= j && p <= _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + extreme = UpdateExtreme(extreme, excess, type) ; + } + + return excess ; + } + + // Given the global min value between [i,j], try to find its count + // @return: the excess after j or the block if j is out of the bo + int64_t MinCountBlock(size_t i, size_t j, int64_t min, size_t &minCnt, const WORD *B) const + { + size_t p ; // the index for loop + + size_t f = i / _cwidth ; // first chunk + size_t t = j / _cwidth ; // last chunk + + int64_t excess = 0 ; + minCnt = 0 ; + + for (p = i ; p < f * _cwidth + _cwidth && p <= j && p < _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + if (excess == min) + ++minCnt ; + } + if (f == t) + return excess ; + + // Since we search to t-1, there is no need to worry about reading after _n + for (p = f + 1 ; p <= t - 1 ; ++p) + { + WORD chunk = Utils::BitsRead(B, p * _cwidth, (p+1) * _cwidth - 1) ; + if (excess + _C[chunk].min == min) + minCnt += _C[chunk].n ; + excess += _C[chunk].e ; + } + + // last chunk + for (p = t * _cwidth ; p <= j && p <= _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + if (excess == min) + ++minCnt ; + } + + return excess ; + } + + // Return the excess. The coordinate of the k-th min is returned through selectk. 
set _n if not found + int64_t MinSelectBlock(size_t i, size_t j, int64_t min, size_t kthMin, size_t &selectk, size_t &minCnt, const WORD *B) const + { + size_t p ; // the index for loop + + size_t f = i / _cwidth ; // first chunk + size_t t = j / _cwidth ; // last chunk + + int64_t excess = 0 ; + minCnt = 0 ; + + for (p = i ; p < f * _cwidth + _cwidth && p <= j && p < _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + if (excess == min) + { + ++minCnt ; + if (minCnt == kthMin) + { + selectk = p ; + return excess ; + } + } + } + if (f == t) + { + selectk = _n ; + return excess ; + } + + // Since we search to t-1, there is no need to worry about reading after _n + for (p = f + 1 ; p <= t - 1 ; ++p) + { + WORD chunk = Utils::BitsRead(B, p * _cwidth, (p+1) * _cwidth - 1) ; + if (excess + _C[chunk].min == min) + { + if (kthMin <= minCnt + _C[chunk].n) + { + size_t chunki ; + for (chunki = p * _cwidth ; chunki < (p + 1) * _cwidth ; ++chunki) + { + excess += (2 * Utils::BitRead(B, chunki) - 1) ; + if (excess == min) + { + ++minCnt ; + if (minCnt == kthMin) + { + selectk = chunki ; + return excess ; + } + } + } + } + minCnt += _C[chunk].n ; + } + excess += _C[chunk].e ; + } + + // last chunk + for (p = t * _cwidth ; p <= j && p <= _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + if (excess == min) + { + ++minCnt ; + if (minCnt == kthMin) + { + selectk = p ; + return excess ; + } + } + } + + return excess ; + } + +public: + DS_RangeMinMaxTree() + { + _space = _b = 0 ; + _cwidth = 8 ; + _tree = _C = NULL ; + } + + ~DS_RangeMinMaxTree() + { + Free() ; + } + + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + void Free() + { + if (_tree != 0) + { + free(_tree) ; + free(_C) ; + _tree = NULL ; + _C = NULL ; + } + } + + void SetBlockSize(size_t b) + { + _b = b ; + } + + // s: some special to track the count. 
+ // slen: length of the special character + void Init(const WORD *B, size_t n) + { + size_t i, j ; + if (_b <= 8) // block size has to be larger than a byte, and should be power of 2 + _b = 1024 ; + + _n = n ; + _r = DIV_CEIL(n, _b) ; + _height = Utils::Log2Ceil(_r) ; + _tree = (struct _rangeMinMaxTreeNode *)malloc(sizeof(*_tree) * (2 * _r - 1)) ; + _space += sizeof(*_tree) * (2*_r-1) ; + + InitPrecomputedChunks() ; + + // Initialize the leafs + for (i = 0 ; i < n ; i += _b) + { + size_t treeIdx = LeafNum(i / _b) ; + _tree[treeIdx].e = 0 ; + _tree[treeIdx].min = 2 ; + _tree[treeIdx].max = -2 ; + _tree[treeIdx].n = 0 ; + for (j = i ; j < n && j < i + _b ; j += _cwidth) + { + uint64_t chunk = Utils::BitsRead(B, j, j + _cwidth - 1) ; + _tree[treeIdx].Merge( _C[chunk] ) ; + } + } + + // Initialize internal nodes + for (i = _r - 2 ; i < _r ; --i) + { + _tree[i] = _tree[2 * i + 1] ; + if (2 * i + 2 < (2 * _r - 1)) + _tree[i].Merge(_tree[2 * i + 2]) ; + } + } + + //It's a bit different from the textbook that the FwdSearch + // in our implementation include the effect from i. + // This makes 0-based indexing have better definition. + //@return: the index j >= i that has excess different d + // return _n if not found + size_t FwdSearch(size_t i, int64_t d, const WORD *B, size_t n) const + { + size_t j ; + int64_t excess ; + excess = FwdBlock(i, d, j, B) ; + + if (excess == d) + return j ; + if (j == _n) + return _n ; + // Not in current block, so we need to search the tree to find the block + // v is the tree node index. + size_t v = LeafNum(i / _b) ; + + // Go up the tree first + // After the iterations, v+1 should be the node containing the target j. 
+ // The test for v+1>=2*_r-1 is for the case where the last level is not full (leaf level) + // if it is rightmost leaf on the last level, we can directly go to parent + while (v + 1 <= LevelMaxNum(v) && + (v + 1 >= 2*_r-1 || (d <= 0 && excess > d && excess + _tree[v + 1].min > d) + || (d >= 0 && excess < d && excess + _tree[v + 1].max < d) )) + // next node block is not enough + // The next node not necessarily the brother node, but the covered region is adjacent + // based on the numbering system. + { + if ((v & 1) == 1) // v is left child. Note that our index is 0-based, so it is not the 2*xxx relation in 1-based index. + excess += _tree[v + 1].e ; + v = (v-1) / 2 ; // parent node + } + + if (v == LevelMaxNum(v)) // Not found. v is the rightmost block on the level + return _n ; + + // Go down the tree to locate the block + ++v ; + while (2 * v + 2 < 2 * _r - 1) + { + if ((d <= 0 && excess > d && excess + _tree[2 * v + 1].min <= d) + || (d >= 0 && excess < d && excess + _tree[2 * v + 1].max >= d)) + v = 2 * v + 1 ; + else + { + excess += _tree[2 * v + 1].e ; + v = 2 * v + 2 ; + } + } + + // The else branch above may go beyond the number of blocks + if (v >= 2 * _r - 1) + return _n ; + + // Search the target leaf block + // The FwdBlock searches things after the index, so we need put -1 in + // NumLeaf(v) * _b + excess = FwdBlock(NumLeaf(v) * _b, d - excess, j, B) ; + return j ; + } + + //@return: the index j <= i that has excess different d comparing with i from right to left + // return _n if not found + size_t BwdSearch(size_t i, int64_t d, const WORD *B, size_t n) const + { + size_t j ; + int64_t excess ; + /*if (i == 0) + { + // d == 1, b0 == 0 + // or d == -1, b0==1 + if (d == -2 * Utils::BitRead(B, 0) + 1 ) + return 0 ; + return _n ; + }*/ + + excess = BwdBlock(i, d, j, B) ; + if (excess == d) + return j ; + if (j == _n) + return _n ; + size_t v = LeafNum(i / _b) ; + + // Go up the tree first + // After the iterations, v-1 should be the node containing the 
target j. + while (v != 0 && v - 1 >= LevelMinNum(v) && + ((d <= 0 && excess > d && excess + _tree[v - 1].RevMin() > d) + || (d >= 0 && excess < d && excess + _tree[v - 1].RevMax() < d) )) + // next node block is not enough + // The next node not necessarily the brother node, but the covered region is adjacent + // based on the numbering system. + { + if ((v & 1) == 0) // v is right child. Note that our index is 0-based, so it is not the 2*xxx relation in 1-based index. + excess += _tree[v - 1].RevE() ; + v = (v-1) / 2 ; // parent node + } + + if (v == LevelMinNum(v)) // Not found. v is the leftmost block on the level + return _n ; + + // Go down the tree to locate the block + --v ; + while (2 * v + 2 < 2 * _r - 1) + { + if ((d <= 0 && excess > d && excess + _tree[2 * v + 2].RevMin() <= d) + || (d >= 0 && excess < d && excess + _tree[2 * v + 2].RevMax() >= d)) + v = 2 * v + 2 ; + else + { + excess += _tree[2 * v + 2].RevE() ; + v = 2 * v + 1 ; + } + } + + // The else branch above may go beyond the number of blocks + if (v >= 2 * _r - 1) + return _n ; + + + // Search the target leaf block + // BwdBlock is inclusive + excess = BwdBlock(NumLeaf(v) * _b + _b - 1, d - excess, j, B) ; + return j ; + } + + // type: 0-min, 1-max + //@return: the min/max value in B[i,j]: included the effects from B[i] and B[j] + int64_t ExtremeExcess(size_t i, size_t j, int type, const WORD *B, size_t n) const + { + int64_t extreme = 0 ; + int64_t excess = 0 ; + + excess = ExtremeBlock(i, MIN(j, (i / _b) * _b + _b - 1), type, extreme, B); + if (j/_b <= i / _b) // in the same block + return extreme ; + + //printf("%d %d: %d %d\n", i, j, extreme, excess) ; + // Search the tree + size_t v = LeafNum(i / _b) ; + size_t l = LeafNum(j / _b) ; + int levelv = GetLevel(v) ; + int levell = GetLevel(l) ; + + // Upward search + // v+1 > l: l is in the upper level + // or l is still to the right of v+1 + while (v + 1 > l || v+1 != PromoteLevel(l, levell - levelv)) + { + if ( (v & 1) == 1 && v+1 < 2*_r - 
1) //left children + { + extreme = UpdateExtreme(extreme, excess + + (type == 0 ? _tree[v + 1].min : _tree[v + 1].max), type) ; + excess += _tree[v + 1].e ; + } + v = (v - 1) / 2 ; + --levelv ; + } + //printf("%d %d: %d %d %d\n", i, j, v, extreme, excess) ; + + // Downward search. Now l should be in v+1 + ++v ; + while (v < _r - 1) // internal nodes + { + if ((type == 0 && extreme <= excess + _tree[v].min) + || (type == 1 && extreme >= excess + _tree[v].max)) + return extreme ; + + if (2 * v + 1 != PromoteLevel(l, levell - (levelv + 1))) + { + extreme = UpdateExtreme(extreme, excess + + (type == 0 ? _tree[2*v + 1].min : _tree[2*v + 1].max), type) ; + excess += _tree[2*v + 1].e ; + v = 2 * v + 2 ; + } + else + v = 2 * v + 1 ; + ++levelv ; + } + //printf("%d %d: %d %d. %d %d\n", i, j, v, extreme, excess, _tree[v].min) ; + + if ((type == 0 && extreme <= excess + _tree[v].min) + || (type == 1 && extreme >= excess + _tree[v].max)) + return extreme ; + + // last block + int64_t lastExtreme = 0 ; + ExtremeBlock((j / _b) * _b, j, type, lastExtreme, B) ; + //printf("%d %d: %d vs %d %d\n", i, j, extreme, excess, lastExtreme) ; + + return UpdateExtreme(extreme, excess + lastExtreme, type) ; + } + + // Leftmost position of a minimum in excess(B, i, j) + size_t Rmq(size_t i, size_t j, const WORD *B, size_t n) const + { + int64_t min = ExtremeExcess(i, j, 0, B, n) ; + return FwdSearch(i, min, B, n) ; + } + + // Leftmost position of a maximum in excess(B, i, j) + size_t RMq(size_t i, size_t j, const WORD *B, size_t n) const + { + int64_t max = ExtremeExcess(i, j, 1, B, n) ; + return FwdSearch(i, max, B, n) ; + } + + // Need .maxn in node structure to support maxcount but not implemented now, + // depends on future application to decide whether implement this feature. 
+ size_t MinCount(size_t i, size_t j, const WORD *B, size_t n) const + { + // The min in this whole range + int64_t min = ExtremeExcess(i, j, 0, B, n) ; + + // Get first block + size_t minCnt = 0 ; + int64_t excess = 0 ; + + excess = MinCountBlock(i, MIN(j, (i / _b) * _b + _b - 1), min, minCnt, B); + if (j/_b <= i / _b) // in the same block + return minCnt ; + //printf("%d: %d %d\n", i, min, minCnt) ; + // Search the tree + size_t v = LeafNum(i / _b) ; + size_t l = LeafNum(j / _b) ; + int levelv = GetLevel(v) ; + int levell = GetLevel(l) ; + + // Upward search + // v+1 > l: l is in the upper level + // or l is still to the right of v+1 + while (v + 1 > l || v+1 != PromoteLevel(l, levell - levelv)) + { + if ( (v & 1) == 1 && v+1 < 2*_r - 1) //left children + { + if (excess + _tree[v + 1].min == min) + minCnt += _tree[v + 1].n ; + excess += _tree[v + 1].e ; + } + v = (v - 1) / 2 ; + --levelv ; + } + //printf("%d: %d %d\n", i, v, excess) ; + // Downward search. Now l should be in v+1 + ++v ; + while (v < _r - 1) // internal nodes + { + if (min < excess + _tree[v].min) + return minCnt ; + + if (2 * v + 1 != PromoteLevel(l, levell - (levelv + 1))) + { + if (excess + _tree[2*v + 1].min == min) + minCnt += _tree[2*v + 1].n ; + excess += _tree[2*v + 1].e ; + v = 2 * v + 2 ; + } + else + v = 2 * v + 1 ; + ++levelv ; + } + + //printf("%d: %d %d. %d %d %d\n", i, min, minCnt, excess, v, _tree[v].min) ; + if (min < excess + _tree[v].min) + return minCnt ; + + // last block + size_t lastMinCnt = 0 ; + // Notice the we need to use min-excess here to adjust the excess so far. 
+ MinCountBlock((j / _b) * _b, j, min - excess, lastMinCnt, B) ; + + //printf("%d: %d %d %d\n", i, min, minCnt, lastMinCnt) ; + return minCnt + lastMinCnt ; + } + + // Select the t-th (1-based) minimum element in B[i..j] + size_t MinSelect(size_t i, size_t j, size_t t, const WORD *B, size_t n) const + { + // The min in this whole range + int64_t min = ExtremeExcess(i, j, 0, B, n) ; + + // Get first block + size_t minCnt = 0 ; + int64_t excess = 0 ; + size_t ret = _n ; + + excess = MinSelectBlock(i, MIN(j, (i / _b) * _b + _b - 1), min, t, ret, minCnt, B); + //printf("%d: %d %d\n", i, ret, minCnt) ; + if (j/_b <= i / _b // in the same block + || ret < _n ) // already found + return ret ; + + // Search the tree + size_t v = LeafNum(i / _b) ; + size_t l = LeafNum(j / _b) ; + int levelv = GetLevel(v) ; + int levell = GetLevel(l) ; + + //printf("%d: %d %d. %d %d\n", i, min, minCnt, v, _r) ; + // Upward search + // v+1 > l: l is in the upper level + // or l is still to the right of v+1 + while (v + 1 > l || v+1 != PromoteLevel(l, levell - levelv)) + { + if (v + 1 < 2 * _r - 1 && excess + _tree[v + 1].min == min + && minCnt + _tree[v + 1].n >= t) + break ; + + if ( (v & 1) == 1 && v+1 < 2*_r - 1) //left children + { + if (excess + _tree[v + 1].min == min) + minCnt += _tree[v + 1].n ; + excess += _tree[v + 1].e ; + } + v = (v - 1) / 2 ; + --levelv ; + } + + if (v == LevelMaxNum(v)) // Not found. v is the rightmost block on the level + return _n ; + + //printf("%d: %d %d %d\n", i, v, excess, minCnt) ; + // Downward search. + ++v ; + while (v < _r - 1) // internal nodes + { + if (min < excess + _tree[v].min) + return ret ; + + if ( 2 * v + 1 != PromoteLevel(l, levell - (levelv + 1)) //j is in the right chilad + && (excess + _tree[2 * v + 1].min != min + || minCnt + _tree[2 * v+1].n < t)) // left child could not reach t. 
+ { + if (excess + _tree[2*v + 1].min == min) + minCnt += _tree[2*v + 1].n ; + excess += _tree[2*v + 1].e ; + v = 2 * v + 2 ; + } + else + v = 2 * v + 1 ; + + ++levelv ; + } + + //printf("%d: %d %d. %d %d %d\n", i, min, minCnt, excess, v, _tree[v].min) ; + if (min < excess + _tree[v].min) + return _n ; + + // last block + size_t lastMinCnt = 0 ; + // Notice the we need to use min-excess here to adjust the excess so far. + v = NumLeaf(v) ; + MinSelectBlock(v * _b, MIN(j, (v+1) * _b - 1), min - excess, t - minCnt, ret, lastMinCnt, B) ; + + //printf("%d: %d %d %d %d %d\n", i, v, min, minCnt, lastMinCnt, ret) ; + return ret ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _r) ; + SAVE_VAR(fp, _b) ; + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _height) ; + SAVE_VAR(fp, _cwidth) ; + SAVE_ARR(fp, _tree, 2 * _r - 1) ; + SAVE_ARR(fp, _C, 1 << _cwidth) ; + } + + void Load(FILE *fp) + { + Free() ; + LOAD_VAR(fp, _r) ; + LOAD_VAR(fp, _b) ; + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _height) ; + LOAD_VAR(fp, _cwidth) ; + _tree = (struct _rangeMinMaxTreeNode *)malloc(sizeof(*_tree) * (2 * _r - 1)) ; + LOAD_ARR(fp, _tree, 2 * _r - 1) ; + _C = (struct _rangeMinMaxTreeNode *)malloc(sizeof(*_C) * (1<<_cwidth)) ; + LOAD_ARR(fp, _C, 1 << _cwidth) ; + _space = sizeof(struct _rangeMinMaxTreeNode) * (2 * _r - 1 + (1<<_cwidth)) ; + } +} ; +} + +#endif diff --git a/compactds/DS_Rank.hpp b/compactds/DS_Rank.hpp new file mode 100644 index 0000000..dbe17ff --- /dev/null +++ b/compactds/DS_Rank.hpp @@ -0,0 +1,298 @@ +#ifndef _MOURISL_COMPACTDS_DS_RANK +#define _MOURISL_COMPACTDS_DS_RANK + +#include "Utils.hpp" +#include "FixedSizeElemArray.hpp" + +// The standalone data structe for rank query on a plain bitvector +// Time complexity: constant time +// Extra space complexity (bits): n/b + n/w*log(bw) +namespace compactds { +class DS_Rank +{ +private: + uint64_t *_R ; // the partial sum of 1s for blocks of the bit vector (right exclusive) + FixedSizeElemArray _subR ; // the partial sum within each block for 
constant access + int _b ; // block size, with respective to the word. + int _bshift ; // the number of bits-1 of b + size_t _wordCnt ; + size_t _space ; +public: + DS_Rank() + { + _R = NULL ; + _b = _space = 0 ; + } + + DS_Rank(int blockSize, const WORD *B, const int &n) + { + Init(blockSize, B, n) ; + } + + ~DS_Rank() { Free() ; } + + void Free() + { + if (_R != NULL) + { + free(_R) ; + _R = NULL ; + } + _b = 0 ; + } + + size_t GetSpace() { return _space + sizeof(*this); } + + // blockSize is the number of WORDs for each R + void Init(int blockSize, const WORD *B, const size_t &n) + { + size_t i ; + _b = blockSize ; + if (_b <= 0) + _b = WORDBITS ; + + i = _b >> 1 ; + for (_bshift = 0 ; i != 0 ; i >>= 1, ++_bshift) + ; + + _wordCnt = Utils::BitsToWords(n) ; + size_t blockCnt = DIV_CEIL(_wordCnt, _b) ; + _R = (uint64_t *)malloc(sizeof(uint64_t) * blockCnt) ; + _space += sizeof(uint64_t) * blockCnt ; + _subR.Malloc(Utils::Log2Ceil((_b-1)*WORDBITS), _wordCnt - blockCnt) ; // we don't need to store the first sub-block in each block + _space += _subR.GetSpace() - sizeof(_subR) ; + uint64_t onecntSum = 0 ; + size_t localOneCntSum = 0 ; + for (i = 0 ; i < _wordCnt ; ++i) + { + if (i % _b == 0) + { + _R[i/_b] = onecntSum ; + localOneCntSum = 0 ; + } + else + { + _subR.Write(i - i / _b - 1, localOneCntSum) ; + } + int onecnt = Utils::Popcount(B[i]) ; + onecntSum += onecnt ; + localOneCntSum += onecnt ; + } + } + + int GetBlockSize() const // unit in word + { + return _b ; + } + + int GetSubBlockSize() const + { + return WORDBITS ; + } + + uint64_t *GetR() const + { + return _R ; + } + + const FixedSizeElemArray *GetSubR() const + { + return &_subR ; + } + + + size_t Query(size_t i, const WORD *B, const size_t &n, int inclusive = 1) const + { + if (i >= n) + return Query(n - 1, B, n, inclusive) ; + + size_t wi = i >> WORDBITS_WIDTH ; + return _R[wi >> _bshift] + ((wi&(_b - 1)) ? 
_subR.Read(wi - (wi >> _bshift) - 1) : 0) + + Utils::Popcount(B[wi] & ((MASK(i&(WORDBITS - 1))<> 3) * 2 ; // region/block id + const size_t t = (wi & 7) - 1 ; // the offset in the subblock + return _R[ri] + ((_R[ri + 1]>> ((t + ((t>>60)&8))*9)) & 0x1ff) ; + } + + uint64_t *GetR() const + { + return _R ; + } + + // blockSize is the number of WORDs for each R + void Init(const WORD *B, const size_t &n) + { + size_t i ; + const int b = 8 ; // number of word in each block + const int subrWidth = Utils::Log2Ceil((b-1) * 64) ; // should equal to 9 + _wordCnt = Utils::BitsToWords(n) ; + size_t blockCnt = DIV_CEIL(_wordCnt, b) ; + _R = (uint64_t *)calloc(blockCnt * 2, sizeof(uint64_t)) ; + _space = sizeof(uint64_t) * blockCnt * 2 ; + uint64_t onecntSum = 0 ; + size_t localOneCntSum = 0 ; + for (i = 0 ; i < _wordCnt ; ++i) + { + size_t bi = i/b * 2 ; // block index + int br = i % b ; //remainder + if (br == 0) + { + _R[bi] = onecntSum ; + _R[bi + 1] = 0 ; + localOneCntSum = 0 ; + } + else + { + _R[bi + 1] |= (localOneCntSum << ((br - 1)*subrWidth)) ; + } + int onecnt = Utils::Popcount(B[i]) ; + onecntSum += onecnt ; + localOneCntSum += onecnt ; + } + // Fill in the remaining subr blocks + // so other module don't need to worry + // too much about boundary case + if ((i-1) % b > 0) + { + size_t bi = i/b * 2 ; // block index + for ( ; i % b ; ++i) + { + int br = i % b ; + _R[bi + 1] |= (localOneCntSum << ((br - 1)*subrWidth)) ; + } + } + } + + // read the si-th subblock in block bi + int DecodeSubR(size_t bi, size_t si) const + { + return (_R[2 * bi + 1] >> (si * 9)) & 0x1ff; + } + + size_t Query(size_t i, const WORD *B, const size_t &n, int inclusive = 1) const + { + if (i >= n) + return Query(n - 1, B, n, inclusive) ; + + const size_t wi = (i>>WORDBITS_WIDTH) ; // word id + const size_t ri = (wi >> 3) * 2 ; // region/block id + const size_t t = (wi & 7) - 1 ; // the offset in the subblock + // 0x1ff is the mask for 9 bit + // The ((t>>60)&8))*9) portion is to avoid 
branching when wi%8 == 0 + // In this case, t=0xffff.., and (t + ((t>>60)&8))*9) == 63 + // and the top bit of _R[ri+1] is 0, which makes the whole portion == 0 + // The implementation of inclusive also avoids branching using property that + // inclusive variable is binary. + return _R[ri] + ((_R[ri + 1]>> ((t + ((t>>60)&8))*9)) & 0x1ff) + + Utils::Popcount(B[wi] & ((MASK(i&(WORDBITS - 1))< _n) + return POSITIVE_INF ; + if (_n == 1) // i must == 1 here + return 1 ; + + size_t si = (i - 1) / _b ; + + size_t l, m, r ; // variables for binary search. They are coordinates on B + l = _S[type][si] ; + r = _S[type][si + 1] ; + if ((i - 1) % _b == 0) + return l ; + if (_speed == 1 || r - l < _longBlockLength) // r-l is more efficient than V.access. + { + if (_speed == 3) + { + // Adjust l, r using _miniblocks + size_t oldl = l ; + l = oldl + _Imini[type].Read((i - 1) / _minib) ; + if ((i - 1) % _minib + 1 < (unsigned int)(_b / _minib)) // only adjust r if it is not in the last _miniblock in current block + r = oldl + _Imini[type].Read((i - 1) / _minib + 1) ; + } + if (_speed <= 3) + { + --r ; + + // Locate the R + uint64_t *rankR = rank.GetR() ; // rankR is right open + int rankBlockSize = rank.GetBlockSize() ; // this block size is with respect to WORD + size_t rl, rr ; + size_t tmp ; + rl = l / (rankBlockSize * WORDBITS) ; + rr = r / (rankBlockSize * WORDBITS) ; + while (rl <= rr) + { + m = (rl + rr) / 2 ; + tmp = rankR[m] ; + if (type == 0) + tmp = m * rankBlockSize * WORDBITS - tmp ; + + if (tmp < i) + rl = m + 1 ; + else + rr = m - 1 ; // rankR[0]==0 makes sure m>=1 in the process + } + + // Locate the subR + size_t remaining ; + if (type == 1) + remaining = i - rankR[rr] ; + else + remaining = i - (rr * (rankBlockSize * WORDBITS) - rankR[rr]) ; + size_t subrl, subrr, fixedSubrl ; + int rankSubBlockSize = rank.GetSubBlockSize() ; // This block size is with respecto to bits + const FixedSizeElemArray &rankSubR = *(rank.GetSubR()) ; + + subrl = rr * (rankBlockSize * 
WORDBITS / rankSubBlockSize - 1) ; // the first subr has offset 0, so we don't store them. + subrr = subrl + rankBlockSize * WORDBITS / rankSubBlockSize - 2 ; + if (subrr >= rankSubR.GetSize()) + subrr = rankSubR.GetSize() - 1 ; + bool inFirstSubBlock = false ; + if (rankSubR.GetSize() == 0 || (type == 1 && (uint32_t)rankSubR.Read(subrl) >= remaining) + || (type == 0 && rankSubBlockSize - (uint32_t)rankSubR.Read(subrl) >= remaining) + || subrl >= rankSubR.GetSize()) // The case that the last block has only one subblock, which will not be allocated + inFirstSubBlock = true ; + + fixedSubrl = subrl ; + if ( !inFirstSubBlock ) + { + while (subrl <= subrr) + { + m = (subrl + subrr) / 2 ; + tmp = rankSubR.Read(m) ; + if (type == 0) + tmp = (m - fixedSubrl + 1) * rankSubBlockSize - tmp ; // plus 1 here to incorporate the first sub block + if (tmp < remaining) + subrl = m + 1 ; + else + subrr = m - 1 ; // the in firstsubblock test makes sure this part won't under-flow + } + + if (type == 1) + remaining -= rankSubR.Read(subrr) ; + else + remaining -= ((subrr - fixedSubrl + 1) * rankSubBlockSize - rankSubR.Read(subrr)) ; + } + + // Processing the last WORD + size_t lastWi = 0 ; // index of the last word + WORD lastW = 0 ; + if (inFirstSubBlock) + lastWi = rr * rankBlockSize ; + else + lastWi = rr + (subrr + 1) * (rankSubBlockSize / WORDBITS) ; // here the rr is to compensate for the first subblock missed in every sub block + lastW = B[lastWi] ; + size_t j ; + + int sum = 0 ; + for (j = 0 ; j < WORDBITS ; j += _precomputeb) + { + WORD x = (lastW >> j) & MASK(_precomputeb) ; + int tmp = Utils::Popcount(x) ; + if (type == 0) + tmp = _precomputeb - tmp ; + if (sum + tmp >= (int)remaining) + { + return lastWi * WORDBITS + j + _precomputedShortMiniBlock[type].Read(x * _precomputebElem + remaining - sum - 1) ; + } + sum += tmp ; + } + return POSITIVE_INF ; // should not reach here. 
+ } + else // _speed >= 4 + { + size_t skippedMiniBlocksInLong = _rankV[type].Query(si, _V[type], n, 0) * (_b/_minib) ; + size_t iMini = (i - 1) / _minib - skippedMiniBlocksInLong ; + if ( Utils::BitRead(_Vmini[type], iMini) ) + { + // long mini block + if ((i-1) % _minib == 0) + { + return l + _Imini[type].Read(iMini) ; + } + else + { + size_t iLongMini = _rankVmini[type].Query(iMini - skippedMiniBlocksInLong, + _Vmini[type], _VminiSize[type], 0) ; + //printf("%d\n", iLongMini * (_minib - 1) + (i - 1)%_minib - 1) ; + /*printf("b=%d _minib=%d. i=%d iMini=%d skippedMini=%d iLongMini=%d l=%d _Imini[iMini]=%d x=%d _Ilongmini[x]=%d. ret=%d\n", + b, _minib, i, iMini, skippedMiniBlocksInLong, + iLongMini, l, _Imini[type].Read(iMini), + iLongMini * (_minib - 1) + (i-1)%_minib - 1, _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1)%_minib - 1), + l + _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1) % _minib - 1)) ;*/ + return l + _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1) % _minib - 1) ; + } + } + else + { + // short mini block + size_t offset = l + _Imini[type].Read(iMini) ; + WORD localw = Utils::BitsRead(B, offset, offset + _longMiniBlockLength - 2) ; + return offset + _precomputedShortMiniBlock[type].Read(localw * _minib + (i-1)%_minib) ; + } + } + } + else + { + // long block with sparse 1's + size_t iI = (_rankV[type].Query(si, _V[type], n) - 1) * (_b - 1); // block index in I + //printf("long block %d %d %d %d %d. %d\n", i, b, si, Utils::BitRead(V[type], si), iI, _I[type].Read(iI + (i - 1)%b - 1)) ; + return _I[type].Read(iI + (i - 1)%_b - 1) ; + } + } + + // The select that handles both types (0 or 1). Optimized for rank9 + size_t GeneralQuery(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n, int type) const + { + if (i < 1 || i > _n) + return POSITIVE_INF ; + if (_n == 1) // i must == 1 here + return 1 ; + + size_t si = (i - 1) / _b ; + + size_t l, m, r ; // variables for binary search. 
They are coordinates on B + l = _S[type][si] ; + r = _S[type][si + 1] ; + if ((i - 1) % _b == 0) + return l ; + if (_speed == 1 || r - l < _longBlockLength) // r-l is more efficient than V.access. + { + if (_speed == 3) + { + // Adjust l, r using _miniblocks + size_t oldl = l ; + const int factor = _b / _minib ; // there are factor _miniblocks in each block + l = oldl + _Imini[type].Read((i - 1) / _minib) ; + if ( (int)((i - 1)/_minib) % factor + 1 < factor // only adjust r if it is not in the last _miniblock in current block + && r < _n // and the _miniblock is not the last _miniblock in the all the array + ) + r = oldl + _Imini[type].Read((i - 1) / _minib + 1) ; + } + if (_speed <= 3) + { + --r ; + + // Locate the R + uint64_t *rankR = rank.GetR() ; + const int rankBlockSize = rank.GetBlockSize() ; // this block size is with respect to WORD + size_t rl, rr ; + size_t tmp ; + rl = l / (rankBlockSize * WORDBITS) ; + rr = r / (rankBlockSize * WORDBITS) ; + while (rl <= rr) + { + m = (rl + rr) / 2 ; + tmp = rankR[m << 1] ; + if (type == 0) + tmp = m * rankBlockSize * WORDBITS - tmp ; + + if (tmp < i) + rl = m + 1 ; + else + rr = m - 1 ; // rankR[0]==0 makes sure m>=1 in the process + } + + // Locate the subR + size_t remaining ; + if (type == 1) + remaining = i - rankR[rr<<1] ; + else + remaining = i - (rr * (rankBlockSize * WORDBITS) - rankR[rr<<1]) ; + if (remaining == 512) // Happens only when the block is all 1 and we are query the last element. 
Number 512 requires more than 9 bits to represent + { + return rr * rankBlockSize * WORDBITS + 511 ; + } + // Mark the lowest bit for every 9-bit block + const uint64_t l9 = 0x40201008040201ull ; + const uint64_t h9 = l9 << 8 ; // mark the highest bit for every 9-bit block + const uint64_t expandRem = remaining * l9 ; + uint64_t subrWord = rankR[rr * 2 + 1] ; + if (type == 0) // need to take corresponding complement to get the accumulate counts for 0 + { + //64 + ((64*2)<<(9*1)) + ((64*3)<<(9*2)) + ((64*4)<<(9*3)) + ((64*5)<<(9*4)) + ((64*6)<<(9*5)) + ((64*7)<<(9*6)) = 0x7030140803000040ull + subrWord = 0x7030140803010040ull - subrWord ; + } + uint64_t bitblockComp = BITBLOCK_LT(subrWord, expandRem, h9) ; + size_t subrr = (((bitblockComp >> 8) * l9) >> 54ull) & 7ull ; + // Processing the last WORD + size_t lastWi ; // index of the last word + lastWi = rr * rankBlockSize + subrr ; + if (lastWi >= rank.GetWordCnt()) + { + lastWi = rank.GetWordCnt() - 1 ; + subrr = lastWi - rr * rankBlockSize ; + } + WORD lastW = B[lastWi] ; + if (subrr > 0) + { + remaining -= ((subrWord >> ((subrr-1) * 9)) & 0x1ff) ; + } + + if (type == 0) + lastW = ~lastW ; + + return lastWi * WORDBITS + Utils::SelectInWord(lastW, remaining) ; + //return POSITIVE_INF ; // should not reach here. + } + else // _speed >= 4 + { + size_t skippedMiniBlocksInLong = _rankV[type].Query(si, _V[type], n, 0) * (_b/_minib) ; + size_t iMini = (i - 1) / _minib - skippedMiniBlocksInLong ; + if ( Utils::BitRead(_Vmini[type], iMini) ) + { + // long mini block + if ((i-1) % _minib == 0) + { + return l + _Imini[type].Read(iMini) ; + } + else + { + size_t iLongMini = _rankVmini[type].Query(iMini - skippedMiniBlocksInLong, + _Vmini[type], _VminiSize[type], 0) ; + //printf("%d\n", iLongMini * (_minib - 1) + (i - 1)%_minib - 1) ; + /*printf("b=%d _minib=%d. i=%d iMini=%d skippedMini=%d iLongMini=%d l=%d _Imini[iMini]=%d x=%d _Ilongmini[x]=%d. 
ret=%d\n", + b, _minib, i, iMini, skippedMiniBlocksInLong, + iLongMini, l, _Imini[type].Read(iMini), + iLongMini * (_minib - 1) + (i-1)%_minib - 1, _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1)%_minib - 1), + l + _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1) % _minib - 1)) ;*/ + return l + _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1) % _minib - 1) ; + } + } + else + { + // short mini block + size_t offset = l + _Imini[type].Read(iMini) ; + WORD localw = Utils::BitsRead(B, offset, offset + _longMiniBlockLength - 2) ; + return offset + _precomputedShortMiniBlock[type].Read(localw * _minib + (i-1)%_minib) ; + } + } + } + else + { + // long block with sparse 1's + size_t iI = (_rankV[type].Query(si, _V[type], n) - 1) * (_b - 1); // block index in I + //printf("long block %d %d %d %d %d. %d\n", i, b, si, Utils::BitRead(V[type], si), iI, _I[type].Read(iI + (i - 1)%b - 1)) ; + return _I[type].Read(iI + (i - 1)%_b - 1) ; + } + } +public: + DS_Select() + { + _S[0] = _S[1] = NULL ; + _V[0] = _V[1] = NULL ; + _Vmini[0] = _Vmini[1] = NULL ; + _n = _totalOneCnt = _b = _space = 0 ; + } + + DS_Select(int blockSize, const WORD *B, const int &n, int selectSpeed, int selectTypeSupport) + { + Init(blockSize, B, n, selectSpeed, selectTypeSupport) ; + } + + ~DS_Select() { Free() ; } + + void Free() + { + int i ; + for (i = 0 ; i <= 1 ; ++i) + { + if (_S[i] != NULL) + { + free(_S[i]) ; + _S[i] = NULL ; + } + + if (_V[i] != NULL) + { + free(_V[i]) ; + _V[i] = NULL ; + } + _rankV[i].Free() ; + _I[i].Free() ; + + if (_Vmini[i] != NULL) + { + free(_Vmini[i]) ; + _Vmini[i] = NULL ; + } + _rankVmini[i].Free() ; + _Imini[i].Free() ; + _Ilongmini[i].Free() ; + _precomputedShortMiniBlock[i].Free() ; + } + _n = _b = 0 ; + } + + size_t GetSpace() { return _space + sizeof(*this); } + + // blockSize is the number of WORDs for each R + // selectTypeSupport: bit coding for whether allocate memory to support select0 and select1 + // 0-bit: select 0, 1-bit: selct1; so 3 means 
support both + void Init(int blockSize, const WORD *B, const size_t &n, int selectSpeed, int selectTypeSupport) + { + _speed = selectSpeed ; + this->_n = n ; + if (selectSpeed == 0 || selectTypeSupport == 0 || n <= 1) + return ; + size_t i, j ; + size_t wordCnt = Utils::BitsToWordBytes(n) / sizeof(WORD) ; + size_t *posBuffer = NULL; + _space = 0 ; + _b = blockSize ; + + // Set the parameters based the desired _speed + if (_b <= (int)WORDBITS) + { + _b = WORDBITS * WORDBITS; + if (_speed == 2) + _b = WORDBITS * Utils::Log2Ceil(n) ; //* Utils::Log2Ceil( Utils::Log2Ceil(n) ) ; + if (_speed == 3) + _b = WORDBITS * WORDBITS ; + if (_speed == 4) + _b = WORDBITS * Utils::Log2Ceil(n) ; + } + + int logn = Utils::Log2Ceil(n) ; + //_longBlockLength = _b * Utils::Log2Ceil(n) * Utils::Log2Ceil(n) ; // Two sampled 1's are too far apart. It should be b*log^2 n + _longBlockLength = logn * logn * logn * logn ; // Two sampled 1's are too far apart. It should be log^4 n + if (_longBlockLength < (unsigned int)_b) + _longBlockLength = _b ; + + _longMiniBlockLength = 0 ; + if (_speed == 2 || _speed == 3) + { + if (n >= (1<<30)) + _precomputeb = 16 ; // relate to precomputed select + else + _precomputeb = 8 ; + _precomputebElem = _precomputeb ; + if (_speed == 3) + { + _minib = 2 * WORDBITS ;//logn * logn ;//CEIL(sqrt((double)b)) ; + _minib -= _b % _minib ; + if (_minib < 3) + { + _minib = 3 ; + if (_b % 3) + _minib = 3 + _b%3 ; + } + } + } + else if (_speed == 4) + { + //_minib = sqrt(log n) + _minib = CEIL(pow((double)_b, 0.25)) ; // We make _minib depends on the choice of _b so it is easier to control the block size. 
+ _minib -= _b % _minib ; + if (_minib < 3) + { + _minib = 3 ; + if (_b % 3) + _minib = 3 + _b%3 ; + } + _longMiniBlockLength = DIV_CEIL(_minib * _minib, 2) ; + posBuffer = (size_t*)malloc(sizeof(*posBuffer) * (_b+1)) ; + _precomputeb = _longMiniBlockLength - 1 ; + _precomputebElem = _minib ; + } + + _totalOneCnt = 0 ; + for (i = 0 ; i < wordCnt ; ++i) + _totalOneCnt += Utils::Popcount(B[i]) ; + + // Sample every other _b 1's (or 0's) + size_t blockCnt[2] ; + blockCnt[0] = DIV_CEIL((n - _totalOneCnt), _b) + 1 ; + blockCnt[1] = DIV_CEIL(_totalOneCnt, _b) + 1 ; + for (i = 0 ; i <= 1 ; ++i) + { + if (!(selectTypeSupport & (1<>j) & 1ull ; + if (!(selectTypeSupport & (1<= 2) + { + int k ; + for (k = 0 ; k <= 1 ; ++k) + { + if (!(selectTypeSupport & (1<= 3) + _Imini[k].Malloc(Utils::Log2Ceil(_longBlockLength), blockCnt[k] * (_b/_minib)) ; + + if (_speed >= 4) + { + _Vmini[k] = Utils::MallocByBits(blockCnt[k] * (_b / _minib)) ; + // The long mini block can be almost as larage as long block length - 1 + _Ilongmini[k].Malloc(Utils::Log2Ceil(_longBlockLength), DIV_CEIL(n, _longMiniBlockLength) * _minib) ; + } + + size_t newISize = 0 ; + size_t new_IminiSize = 0 ; + size_t new_IlongminiSize = 0 ; + for (i = 0 ; i < blockCnt[k] - 1 ; ++i) + { + if (_S[k][i + 1] - _S[k][i] >= _longBlockLength) + { + Utils::BitSet(_V[k], i) ; + // The first element is already stored in S, so no need to store it + for (j = _S[k][i] + 1 ; j < _S[k][i + 1] ; ++j) + { + if (Utils::BitRead(B, j) == k) + { + _I[k].Write(newISize, j) ; + ++newISize ; + } + } + + if (_speed == 3) // For _speed 3, we still fill up I mini + // so we don't need to acces _rankV for efficiency. + // Maybe I should do this to _speed 4 as well. 
+ { + for (j = 0 ; j < (size_t)(_b / _minib) ; ++j) + { + _Imini[k].Write(new_IminiSize, 0) ; + ++new_IminiSize ; + } + } + } + else if (_speed >= 3) // short block case, we only need to process them when _speed==3 + { + int minicnt = 1; + size_t prevj = _S[k][i] ; + if (_speed >= 4) + posBuffer[0] = _S[k][i] ; + // j reaches the beginning of the next block so we can wrap up any unadded + // k's to the _miniblock. This handles both case that the last _miniblock in + // a block or the last _miniblock in the whole bit vector. + for (j = _S[k][i] + 1; j <= _S[k][i + 1] ; ++j) + { + int bit = 0 ; + if (j < n) + bit = Utils::BitRead(B, j) ; + if (bit == k || (j == _S[k][i + 1] && minicnt > 0)) + { + if (minicnt == _minib || (j == _S[k][i + 1] && minicnt > 0)) + { + _Imini[k].Write(new_IminiSize, prevj - _S[k][i]) ; + ++new_IminiSize ; + + if (_speed >= 4 && j - prevj >= _longMiniBlockLength) + { + int l ; + Utils::BitSet(_Vmini[k], new_IminiSize - 1) ; + for (l = 1 ; l < minicnt ; ++l) // we don't need to store the first element + { + _Ilongmini[k].Write(new_IlongminiSize, posBuffer[l] - _S[k][i]) ; + ++new_IlongminiSize ; + } + } + + prevj = j ; + minicnt = 0 ; + } + + if (bit == k) + { + if (_speed >= 4) + posBuffer[minicnt] = j ; + ++minicnt ; + } + } + } + } + } + _I[k].Resize(newISize) ; + _space += _I[k].GetSpace() - sizeof(_I[k]) ; + + _rankV[k].Init(_V[k], blockCnt[k]) ; + _space += _rankV[k].GetSpace() - sizeof(_rankV[k]) ; + + if (_speed >= 3) + { + _Imini[k].Resize(new_IminiSize) ; + _space += _Imini[k].GetSpace() - sizeof(_Imini[k]) ; + //printf("%d %d. %d. 
%d %d\n", _Imini[k].GetSpace(), new_IminiSize, Utils::Log2Ceil(_longBlockLength), _minib, n/_minib) ; + } + + if (_speed >= 4) + { + _Vmini[k] = (WORD *)realloc(_Vmini[k], + Utils::BitsToWordBytes(new_IminiSize)) ; + _VminiSize[k] = new_IminiSize ; + _space += Utils::BitsToWordBytes(new_IminiSize) ; + _rankVmini[k].Init(_Vmini[k], new_IminiSize) ; + _space += _rankVmini[k].GetSpace() - sizeof(_rankVmini[k]) ; + + _Ilongmini[k].Resize(new_IlongminiSize) ; + _space += _Ilongmini[k].GetSpace() - sizeof(_Ilongmini[k]) ; + } + } + } + + if (0 && _speed >= 2) // Now we are using Rank9 and bit operator, so no need for the precomputed element + { + // The precomputed short _miniblocks + unsigned int k ; + for (k = 0 ; k <= 1 ; ++k) + { + if (!(selectTypeSupport & (1<> l) & 1ull)==k) + { + _precomputedShortMiniBlock[k].Write(i * _precomputebElem + j, l) ; + ++j ; + if ((int)j >= _precomputebElem) + break ; + } + } + } + _space += _precomputedShortMiniBlock[k].GetSpace() - sizeof(_precomputedShortMiniBlock[k]) ; + } + if (_speed >= 4) + free(posBuffer) ; + } + } + + // Return the index of the ith (1-index ith) 1. + size_t Query(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n) const + { + return GeneralQuery(i, rank, B, n, 1) ; + } + + // Return the index of the ith (1-index ith) 0. 
+ size_t Query0(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n) const + { + return GeneralQuery(i, rank, B, n, 0) ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _space) ; + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _speed) ; + + if (_speed == DS_SELECT_SPEED_NO || _n == 0) + return ; + + SAVE_VAR(fp, _longBlockLength) ; + SAVE_VAR(fp, _minib); + SAVE_VAR(fp, _longMiniBlockLength) ; + SAVE_VAR(fp, _b) ; + SAVE_VAR(fp, _totalOneCnt) ; + + size_t blockCnt[2] ; + blockCnt[0] = DIV_CEIL((_n - _totalOneCnt), _b) + 1 ; + blockCnt[1] = DIV_CEIL(_totalOneCnt, _b) + 1 ; + for (int i = 0 ; i <= 1 ; ++i) + { + size_t size = Utils::BitsToWords(blockCnt[i]) ; + fwrite(_S[i], sizeof(_S[i][0]), blockCnt[i], fp) ; + if (_speed >= 2) + { + fwrite(_V[i], sizeof(_V[i][0]), size, fp) ; + _rankV[i].Save(fp) ; + _I[i].Save(fp) ; + } + if (_speed >= 3) + _Imini[i].Save(fp) ; + } + } + + void Load(FILE *fp) + { + Free() ; + + LOAD_VAR(fp, _space) ; + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _speed) ; + + if (_speed == DS_SELECT_SPEED_NO || _n == 0) + return ; + + LOAD_VAR(fp, _longBlockLength) ; + LOAD_VAR(fp, _minib); + LOAD_VAR(fp, _longMiniBlockLength) ; + LOAD_VAR(fp, _b) ; + LOAD_VAR(fp, _totalOneCnt) ; + + size_t blockCnt[2] ; + blockCnt[0] = DIV_CEIL((_n - _totalOneCnt), _b) + 1 ; + blockCnt[1] = DIV_CEIL(_totalOneCnt, _b) + 1 ; + for (int i = 0 ; i <= 1 ; ++i) + { + size_t size = Utils::BitsToWords(blockCnt[i]) ; + _S[i] = (size_t *)malloc(sizeof(_S[i][0]) * blockCnt[i]) ; + fread(_S[i], sizeof(_S[i][0]), blockCnt[i], fp) ; + + if (_speed >= 2) + { + _V[i] = Utils::MallocByBits(blockCnt[i]) ; + fread(_V[i], sizeof(_V[i][0]), size, fp) ; + _rankV[i].Load(fp) ; + _I[i].Load(fp) ; + } + + if (_speed >= 3) + _Imini[i].Load(fp) ; + } + } +} ; +} + +#endif diff --git a/compactds/DS_Select_Test.hpp b/compactds/DS_Select_Test.hpp new file mode 100644 index 0000000..42303b5 --- /dev/null +++ b/compactds/DS_Select_Test.hpp @@ -0,0 +1,537 @@ +#ifndef _MOURISL_COMPACTDS_DS_SELECT_TEST 
+#define _MOURISL_COMPACTDS_DS_SELECT_TEST + +#include "Utils.hpp" +#include "DS_Rank.hpp" + +// The standalone data structe for select query on a plain bitvector with precomputed rank information +// n - bitvector length. m - number of 1s (or 0s for select0) +// Speed 1: Time complexity: O(log n/m) [space: O(n/w)] +// Speed 2: Time complexity: O(log log n) [space: O(n/log n)] +// Most of the space are reuse the rank structure though, +// so in practice the extra space is stil only O(n/w) +// Seems speed 3, 4 does not work properly... +// Speed 3: Time complexity: O(log log n) [space: O(n/log n)] +// inspired by the implementation in SDSL +// Speed 4: Time complexity: O(1) [space: O(n loglog n / sqrt(log n) + sqrt(n))] +// The textbook O(n/log log n)-space algorithm has too large factor +// for precomputed short miniblocks. Not very pratical. +// + +#define DS_SELECT_SPEED_NO 0 +#define DS_SELECT_SPEED_SAMPLED 1 +#define DS_SELECT_SPEED_RANKBINARY 2 +#define DS_SELECT_SPEED_DENSESAMPLE 3 +#define DS_SELECT_SPEED_CONSTANT 4 + +namespace compactds { +class DS_Select_Test +{ +private: + size_t *S[2] ; // sampled position for 0's and 1's + + // Data structures for long blocks + size_t longBlockLength ; + WORD *V[2] ; // indicator whether a S block is long (1) or short + DS_Rank rankV[2] ; + FixedSizeElemArray I[2] ; // precomputed index within long block + + int precomputeb ; // the precomputed offsets within a word of size b + int precomputebElem ; // how many 1s we should consider for such word. + + int minib ; // mini block size (the number of 1's) + size_t longMiniBlockLength ; // long mini block length, for speed==3,4 + WORD *Vmini[2] ; // indicator whether a S block is long mini or not + size_t VminiSize[2] ; + DS_Rank rankVmini[2] ; + FixedSizeElemArray Imini[2] ; // offset for the beginning of mini block + FixedSizeElemArray Ilongmini[2] ; // offset for each element in long-mini block + + // Concatenated precomputed short mini block's S. 
We need concatenation, otherwise too + // much overhead in the FixedSizeElemArray structure. + // Even without FixedSizeElemArray, the pointers will take too much space. + FixedSizeElemArray precomputedShortMiniBlock[2] ; + + int b ; // block size (the number of 1's in a block) or sampling rate + size_t n ; + size_t totalOneCnt ; + + size_t space ; + + int speed ; // 0: do not allocate; 1: slow, 2: medium, 3: medium-fast 4: fastest, constant time + + // The select that handles both types (0 or 1). + size_t GeneralQuery(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n, int type) const + { + if (i < 1 || i > n) + return POSITIVE_INF ; + if (n == 1) // i must == 1 here + return 1 ; + + size_t si = (i - 1) / b ; + + size_t l, m, r ; // variables for binary search. They are coordinates on B + l = S[type][si] ; + r = S[type][si + 1] ; + if ((i - 1) % b == 0) + return l ; + + if (speed == 1 || r - l < longBlockLength) // r-l is more efficient than V.access. + { + if (speed == 3) + { + // Adjust l, r using miniblocks + size_t oldl = l ; + l = oldl + Imini[type].Read((i - 1) / minib) ; + if ((i - 1) / minib + 1 < (unsigned int)(b / minib)) // only adjust r if it is not in the last miniblock in current block + r = oldl + Imini[type].Read((i - 1) / minib + 1) ; + } + if (speed <= 3) + { + --r ; + + // Locate the R + uint64_t *rankR = rank.GetR() ; + const int rankBlockSize = rank.GetBlockSize() ; // this block size is with respect to WORD + size_t rl, rr ; + size_t tmp ; + rl = l / (rankBlockSize * WORDBITS) ; + rr = r / (rankBlockSize * WORDBITS) ; + while (rl <= rr) + { + m = (rl + rr) / 2 ; + tmp = rankR[m << 1] ; + if (type == 0) + tmp = m * rankBlockSize * WORDBITS - tmp ; + + if (tmp < i) + rl = m + 1 ; + else + rr = m - 1 ; // rankR[0]==0 makes sure m>=1 in the process + } + + // Locate the subR + size_t remaining ; + if (type == 1) + remaining = i - rankR[rr<<1] ; + else + remaining = i - (rr * (rankBlockSize * WORDBITS) - rankR[rr<<1]) ; + + int subrl, 
subrr ; + subrl = 0 ; // the first subr has offset 0, so we don't store them. + subrr = 6 ; + if (rr * rankBlockSize + 1 + subrr >= rank.GetWordCnt()) + subrr = rank.GetWordCnt() - 2 - rr * rankBlockSize; + bool inFirstSubBlock = false ; + if (rank.GetWordCnt() <= 1 + || (type == 1 && rank.DecodeSubR(rr, 0) >= remaining) + || (type == 0 && WORDBITS - rank.DecodeSubR(rr, 0) >= remaining) + || subrr < subrl) // The case that the last block has only one subblock, which will not be allocated + inFirstSubBlock = true ; + + if ( !inFirstSubBlock ) + { + size_t rword = rankR[2 * rr + 1] ; + while (subrl <= subrr) + { + m = (subrl + subrr) / 2 ; + tmp = (rword >> (m * 9ull)) & 0x1ff ; + //printf("%d %d. %llu\n", m, tmp, rword) ; + if (type == 0) + tmp = (m + 1) * WORDBITS - tmp ; // plus 1 here to incorporate the first sub block + if (tmp < remaining) + subrl = m + 1 ; + else + subrr = m - 1 ; // the in firstsubblock test makes sure this part won't under-flow + } + + if (type == 1) + remaining -= (rword >> (subrr * 9)) & 0x1ff ; + else + remaining -= ((subrr + 1) * WORDBITS - ((rword >> (subrr * 9)) & 0x1ff)) ; + } + + // Processing the last WORD + size_t lastWi = 0 ; // index of the last word + WORD lastW = 0 ; + if (inFirstSubBlock) + lastWi = rr * rankBlockSize ; + else + lastWi = rr * rankBlockSize + subrr + 1 ; // here the rr is to compensate for the first subblock missed in every sub block + lastW = B[lastWi] ; + size_t j ; + + int sum = 0 ; + for (j = 0 ; j < WORDBITS ; j += precomputeb) + { + WORD x = (lastW >> j) & MASK(precomputeb) ; + int tmp = Utils::Popcount(x) ; + if (type == 0) + tmp = precomputeb - tmp ; + if (sum + tmp >= (int)remaining) + { + return lastWi * WORDBITS + j + precomputedShortMiniBlock[type].Read(x * precomputebElem + remaining - sum - 1) ; + } + sum += tmp ; + } + return POSITIVE_INF ; // should not reach here. 
+ } + else // speed >= 4 + { + size_t skippedMiniBlocksInLong = rankV[type].Query(si, V[type], n, 0) * (b/minib) ; + size_t iMini = (i - 1) / minib - skippedMiniBlocksInLong ; + if ( Utils::BitRead(Vmini[type], iMini) ) + { + // long mini block + if ((i-1) % minib == 0) + { + return l + Imini[type].Read(iMini) ; + } + else + { + size_t iLongMini = rankVmini[type].Query(iMini - skippedMiniBlocksInLong, + Vmini[type], VminiSize[type], 0) ; + //printf("%d\n", iLongMini * (minib - 1) + (i - 1)%minib - 1) ; + /*printf("b=%d minib=%d. i=%d iMini=%d skippedMini=%d iLongMini=%d l=%d Imini[iMini]=%d x=%d Ilongmini[x]=%d. ret=%d\n", + b, minib, i, iMini, skippedMiniBlocksInLong, + iLongMini, l, Imini[type].Read(iMini), + iLongMini * (minib - 1) + (i-1)%minib - 1, Ilongmini[type].Read(iLongMini * (minib - 1) + (i-1)%minib - 1), + l + Ilongmini[type].Read(iLongMini * (minib - 1) + (i-1) % minib - 1)) ;*/ + return l + Ilongmini[type].Read(iLongMini * (minib - 1) + (i-1) % minib - 1) ; + } + } + else + { + // short mini block + size_t offset = l + Imini[type].Read(iMini) ; + WORD localw = Utils::BitsRead(B, offset, offset + longMiniBlockLength - 2) ; + return offset + precomputedShortMiniBlock[type].Read(localw * minib + (i-1)%minib) ; + } + } + } + else + { + // long block with sparse 1's + size_t iI = (rankV[type].Query(si, V[type], n) - 1) * (b - 1); // block index in I + //printf("long block %d %d %d %d %d. 
%d\n", i, b, si, Utils::BitRead(V[type], si), iI, I[type].Read(iI + (i - 1)%b - 1)) ; + return I[type].Read(iI + (i - 1)%b - 1) ; + } + } +public: + DS_Select_Test() + { + S[0] = S[1] = NULL ; + V[0] = V[1] = NULL ; + Vmini[0] = Vmini[1] = NULL ; + n = totalOneCnt = b = space = 0 ; + } + + DS_Select_Test(int blockSize, const WORD *B, const int &n, int selectSpeed, int selectTypeSupport) + { + Init(blockSize, B, n, selectSpeed, selectTypeSupport) ; + } + + ~DS_Select_Test() { Free() ; } + + void Free() + { + int i ; + for (i = 0 ; i <= 1 ; ++i) + { + if (S[i] != NULL) + { + free(S[i]) ; + S[i] = NULL ; + } + + if (V[i] != NULL) + { + free(V[i]) ; + V[i] = NULL ; + } + rankV[i].Free() ; + I[i].Free() ; + + if (Vmini[i] != NULL) + { + free(Vmini[i]) ; + Vmini[i] = NULL ; + } + rankVmini[i].Free() ; + Imini[i].Free() ; + Ilongmini[i].Free() ; + precomputedShortMiniBlock[i].Free() ; + } + n = b = 0 ; + } + + size_t GetSpace() { return space + sizeof(*this); } + + // blockSize is the number of WORDs for each R + // selectTypeSupport: bit coding for whether allocate memory to support select0 and select1 + // 0-bit: select 0, 1-bit: selct1; so 3 means support both + void Init(int blockSize, const WORD *B, const size_t &n, int selectSpeed, int selectTypeSupport) + { + if (selectSpeed == 0 || selectTypeSupport == 0 || n <= 1) + return ; + size_t i, j ; + size_t wordCnt = Utils::BitsToWordBytes(n) / sizeof(WORD) ; + size_t *posBuffer = NULL; + this->n = n ; + speed = selectSpeed ; + space = 0 ; + b = blockSize ; + + // Set the parameters based the desired speed + if (b <= (int)WORDBITS) + { + b = WORDBITS * WORDBITS; + if (speed >= 2) + b = WORDBITS * Utils::Log2Ceil(n) ; //* Utils::Log2Ceil( Utils::Log2Ceil(n) ) ; + if (speed == 4) + b = WORDBITS * Utils::Log2Ceil(n) ; + } + + longBlockLength = b * Utils::Log2Ceil(n) * Utils::Log2Ceil(n) ; // Two sampled 1's are too far apart. 
It should be b*log^2 n + if (speed == 2 || speed == 3) + { + if (n >= (1<<30)) + precomputeb = 16 ; // relate to precomputed select + else + precomputeb = 8 ; + precomputebElem = precomputeb ; + if (speed == 3) + { + minib = CEIL(sqrt((double)b)) ; + minib -= b % minib ; + if (minib < 3) + { + minib = 3 ; + if (b % 3) + minib = 3 + b%3 ; + } + } + } + else if (speed == 4) + { + //minib = sqrt(log n) + minib = CEIL(pow((double)b, 0.25)) ; // We make minib depends on the choice of b so it is easier to control the block size. + minib -= b % minib ; + if (minib < 3) + { + minib = 3 ; + if (b % 3) + minib = 3 + b%3 ; + } + longMiniBlockLength = DIV_CEIL(minib * minib, 2) ; + posBuffer = (size_t*)malloc(sizeof(*posBuffer) * (b+1)) ; + precomputeb = longMiniBlockLength - 1 ; + precomputebElem = minib ; + } + + totalOneCnt = 0 ; + for (i = 0 ; i < wordCnt ; ++i) + totalOneCnt += Utils::Popcount(B[i]) ; + + // Sample every other b 1's (or 0's) + size_t blockCnt[2] ; + blockCnt[0] = DIV_CEIL((n - totalOneCnt), b) + 1 ; + blockCnt[1] = DIV_CEIL(totalOneCnt, b) + 1 ; + for (i = 0 ; i <= 1 ; ++i) + { + if (!(selectTypeSupport & (1<>j) & 1ull ; + if (!(selectTypeSupport & (1<= 2) + { + int k ; + for (k = 0 ; k <= 1 ; ++k) + { + if (!(selectTypeSupport & (1<= 3) + Imini[k].Malloc(Utils::Log2Ceil(longBlockLength), blockCnt[k] * (b/minib)) ; + + if (speed >= 4) + { + Vmini[k] = Utils::MallocByBits(blockCnt[k] * (b / minib)) ; + // The long mini block can be almost as larage as long block length - 1 + Ilongmini[k].Malloc(Utils::Log2Ceil(longBlockLength), DIV_CEIL(n, longMiniBlockLength) * minib) ; + } + + size_t newISize = 0 ; + size_t newIminiSize = 0 ; + size_t newIlongminiSize = 0 ; + for (i = 0 ; i < blockCnt[k] - 1 ; ++i) + { + if (S[k][i + 1] - S[k][i] >= longBlockLength) + { + Utils::BitSet(V[k], i) ; + // The first element is already stored in S, so no need to store it + for (j = S[k][i] + 1 ; j < S[k][i + 1] ; ++j) + { + if (Utils::BitRead(B, j) == k) + { + 
I[k].Write(newISize, j) ; + ++newISize ; + } + } + + if (speed == 3) // For speed 3, we still fill up I mini + // so we don't need to acces rankV for efficiency. + // Maybe I should do this to speed 4 as well. + { + for (i = 0 ; i < (size_t)(b / minib) ; ++i) + { + Imini[k].Write(newIminiSize, 0) ; + ++newIminiSize ; + } + } + } + else if (speed >= 3) // short block case, we only need to process them when speed==3 + { + int minicnt = 1; + size_t prevj = S[k][i] ; + if (speed >= 4) + posBuffer[0] = S[k][i] ; + // j reaches the beginning of the next block so we can wrap up any unadded + // k's to the miniblock. This handles both case that the last miniblock in + // a block or the last miniblock in the whole bit vector. + for (j = S[k][i] + 1; j <= S[k][i + 1] ; ++j) + { + int bit = 0 ; + if (j < n) + bit = Utils::BitRead(B, j) ; + if (bit == k || (j == S[k][i + 1] && minicnt > 0)) + { + if (minicnt == minib || (j == S[k][i + 1] && minicnt > 0)) + { + Imini[k].Write(newIminiSize, prevj - S[k][i]) ; + ++newIminiSize ; + + if (speed >= 4 && j - prevj >= longMiniBlockLength) + { + int l ; + Utils::BitSet(Vmini[k], newIminiSize - 1) ; + for (l = 1 ; l < minicnt ; ++l) // we don't need to store the first element + { + Ilongmini[k].Write(newIlongminiSize, posBuffer[l] - S[k][i]) ; + ++newIlongminiSize ; + } + } + + prevj = j ; + minicnt = 0 ; + } + + if (bit == k) + { + if (speed >= 4) + posBuffer[minicnt] = j ; + ++minicnt ; + } + } + } + } + } + I[k].Resize(newISize) ; + space += I[k].GetSpace() - sizeof(I[k]) ; + + rankV[k].Init(-1, V[k], blockCnt[k]) ; + space += rankV[k].GetSpace() - sizeof(rankV[k]) ; + + if (speed >= 3) + { + Imini[k].Resize(newIminiSize) ; + space += Imini[k].GetSpace() - sizeof(Imini[k]) ; + //printf("%d %d. %d. 
%d %d\n", Imini[k].GetSpace(), newIminiSize, Utils::Log2Ceil(longBlockLength), minib, n/minib) ; + } + + if (speed >= 4) + { + Vmini[k] = (WORD *)realloc(Vmini[k], + Utils::BitsToWordBytes(newIminiSize)) ; + VminiSize[k] = newIminiSize ; + space += Utils::BitsToWordBytes(newIminiSize) ; + rankVmini[k].Init(-1, Vmini[k], newIminiSize) ; + space += rankVmini[k].GetSpace() - sizeof(rankVmini[k]) ; + + Ilongmini[k].Resize(newIlongminiSize) ; + space += Ilongmini[k].GetSpace() - sizeof(Ilongmini[k]) ; + } + } + } + + if (speed >= 2) + { + // The precomputed short miniblocks + unsigned int k ; + for (k = 0 ; k <= 1 ; ++k) + { + if (!(selectTypeSupport & (1<> l) & 1ull)==k) + { + precomputedShortMiniBlock[k].Write(i * precomputebElem + j, l) ; + ++j ; + if ((int)j >= precomputebElem) + break ; + } + } + } + space += precomputedShortMiniBlock[k].GetSpace() - sizeof(precomputedShortMiniBlock[k]) ; + } + if (speed >= 4) + free(posBuffer) ; + } + } + + // Return the index of the ith (1-index ith) 1. + size_t Query(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n) const + { + return GeneralQuery(i, rank, B, n, 1) ; + } + + // Return the index of the ith (1-index ith) 0. + size_t Query0(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n) const + { + return GeneralQuery(i, rank, B, n, 0) ; + } +} ; +} +#endif diff --git a/compactds/DifferenceCover.hpp b/compactds/DifferenceCover.hpp new file mode 100644 index 0000000..7e1aa76 --- /dev/null +++ b/compactds/DifferenceCover.hpp @@ -0,0 +1,201 @@ +#ifndef _MOURISL_COMPACTDS_DIFFERENCECOVER +#define _MOURISL_COMPACTDS_DIFFERENCECOVER + +#include "Utils.hpp" +#include "SimpleVector.hpp" + +#include +#include + +// The class handling difference covers +// Difference cover is a set of numbers D={a_0, ... a_{m-1}} in range [0, v) +// such that every i in [0,1) there is some a_j, a_k in D s.t. i=(a_j-a_k)%v. +// So the name comes from the differences of a set can cover all the element. 
+// This class also handles when query elements larger than v (cyclic difference cover) +namespace compactds { +class DifferenceCover +{ +private: + int v ; // period size + int *dcs ; // DCs + int m ; // number of DCs + std::map dcMap ; // maybe replace this with a bit vector later + int *precomputedD ; // precomputed information for Delta query + + int GetB(int i, int r) + { + if (i < r) + return 1 ; + else if (i < r + 1) + return r + 1 ; + else if (i < 2 * r + 1) + return 2 * r + 1; + else if (i < 4 * r + 2) + return 4 * r + 3 ; + else if (i < 5 * r + 3) + return 2 * r + 2 ; + else if (i < 6 * r + 3) + return 1 ; + else + return 0 ; // ERROR + } +public: + DifferenceCover() + { + v = 4096 ; + dcs = NULL ; + m = 0 ; + } + + ~DifferenceCover() + { + if (dcs) + { + free(dcs) ; + free(precomputedD) ; + } + } + + // The construction is based on Colbourn, Ling 2000 + void Init(int v) + { + int i ; + if (v <= 13) + v = 14 ; + + this->v = v ; + // Use the Colbourn, Ling method to find the cover + int r = CEIL((-36 + sqrt(1296 - 96*(13 - v)))/48.0) ; + SimpleVector rawdcs ; + rawdcs.Reserve(6 * r + 4) ; + rawdcs.PushBack(0) ; + for (i = 1 ; i <= 6 * r + 3 ; ++i) + rawdcs.PushBack( rawdcs[i - 1] + GetB(i - 1, r)) ; + + // Put the finalized difference cover + m = 0 ; + for (i = 0 ; i < 6 * r + 4 ; ++i) + { + int dc = rawdcs[i] % v ; + if (dcMap.find(dc) == dcMap.end()) + { + dcMap[dc] = m ; + ++m ; + } + } + + dcs = (int *)malloc(sizeof(dcs[0]) * m) ; + i = 0 ; + for (std::map::iterator it = dcMap.begin() ; it != dcMap.end() ; ++it, ++i) + { + dcs[i] = it->first ; + } + + // Reorder them into increasing order + std::sort(dcs, dcs + m) ; + for (i = 0 ; i < m ; ++i) + { + dcMap[dcs[i]] = i ; + } + + // Precompute the look up table d for Delta query + // Lemma 4 in Fast Lightweight Suffix Array Construction and Checking + // We can enumerate all the differences from D + int j ; + precomputedD = (int *)malloc(sizeof(precomputedD[0]) * v) ; + memset(precomputedD, -1, 
sizeof(precomputedD[0]) * v) ; + precomputedD[0] = 0 ; + for (i = 0 ; i < m ; ++i) + { + for (j = 0 ; j < m ; ++j) + { + int d = dcs[j] - dcs[i] ; + if (d < 0) + d += v ; + precomputedD[d] = dcs[i] ; + } + } + } + + static size_t EstimateCoverSize(int v) + { + if (v <= 13) + return POSITIVE_INF ; + int r = CEIL((-36 + sqrt(1296 - 96*(13 - v)))/48.0) ; + return 6 * r + 4 ; + } + + // Check whether an element is in diff-cover + bool IsInDC(size_t i) + { + if (dcMap.find(i%v) != dcMap.end()) + return true ; + return false ; + } + + int GetV() + { + return v ; + } + + // Get the size of the DC that can cover [0, n) + size_t GetSize(size_t n) + { + int i ; + for (i = 0 ; i < m ; ++i) + { + if (dcs[i] >= (int)(n % v)) + break ; + } + return n / v * m + i ; + } + + // Return the difference cover in a list to cover [0, n) + size_t GetDiffCoverList(size_t n, size_t *dcList) + { + int i ; + size_t c ; + size_t cycleCnt = DIV_CEIL(n, v) ; + size_t ret = 0 ; + for (c = 0 ; c < cycleCnt ; ++c) + { + for (i = 0 ; i < m ; ++i) + { + size_t x = c * v + dcs[i] ; + if (x >= n) + break ; + dcList[ret] = x ; + ++ret ; + } + } + return ret ; + } + + // Return the index when skipping the non-DC elements in the list + // Assume i is in the difference cover. + size_t CompactIndex(size_t i) + { + return i / v * m + dcMap[i % v] ; + //int k = dcMap[i%v] ; + //return (n / v) * k + (k < coverCntInLastCycle ? 
k : coverCntInLastCycle) + i / v ; + } + + // Return the offset delta that (i+delta)%v and (j+delta)%v is in the difference cover + int Delta(size_t i, size_t j) + { + int ri = i % v ; + int rj = j % v ; + + int d = (rj - ri)%v ; + if (d < 0) + d += v ; + d = (precomputedD[d] - ri)%v ; + if (d < 0) + d += v ; + + return d ; + } +} ; +} + +#endif diff --git a/compactds/EliasCode.hpp b/compactds/EliasCode.hpp new file mode 100644 index 0000000..48e8b27 --- /dev/null +++ b/compactds/EliasCode.hpp @@ -0,0 +1,74 @@ +#ifndef _MOURISL_COMPACTDS_ELIASCODE +#define _MOURISL_COMPACTDS_ELIASCODE + +#include "Utils.hpp" + +namespace compactds { +class EliasCode +{ +public: + EliasCode() {} + ~EliasCode() {} + + // These function will output + // These methods can only encode positive numbers. + // The bits are also reversed so accessing them is easier. + // Even though the input value is 32-bit, the encoded bits can be greater than 32-bit. + static WORD Unary(int in, int &l) + { + l = in ; + return 1ull << (in - 1); + } + + // Elias gamma + static WORD Gamma(int in, int &l) + { + int i ; + const int n = Utils::CountBits(in) ; + WORD ret = Unary(n, l) ; + // the rightmost bit of Unary(n) and the leftmost bit of in are both 1, so we only need to shift by once. + for (i = n - 2 ; i >= 0 ; --i, ++l) + { + ret |= (((in>>i)&1ull) << l) ; + } + //printf("%s: %d => %d %d %d\n", __func__, in, ret, n, l); + return ret ; + } + + // Elias delta + static WORD Delta(int in, int &l) + { + int i ; + int n = Utils::CountBits(in) ; + WORD ret = Gamma(n, l); + + for (i = n - 2 ; i >= 0 ; --i, ++l) + ret |= (((in>>i)&1) << l) ; + return ret ; // the leftmost bit of in is implicitly 1. 
+ } + + // Read in one Gamma encoded word starting from W's ith bits + // return: the value; l - # of processed bits + static int ReadOneGamma(WORD *W, size_t i, int &l) + { + size_t j, k ; + // Determine the length + for (j = i ; Utils::BitRead(W, j) == 0 ; ++j) + ; + l = j - i + 1 ; + int ret = 1 ; + for (k = j + 1 ; k < j + l ; ++k) + ret = (ret << 1) | Utils::BitRead(W, k) ; + l = k - i ; + return ret ; + } + + // TODO: implement this + static int ReadOneDelta(WORD *W, size_t i, int &out) + { + return 0 ; + } +} ; +} + +#endif diff --git a/compactds/FMBuilder.hpp b/compactds/FMBuilder.hpp new file mode 100644 index 0000000..d595467 --- /dev/null +++ b/compactds/FMBuilder.hpp @@ -0,0 +1,504 @@ +#ifndef _MOURISL_COMPACTDS_FM_BUILDER +#define _MOURISL_COMPACTDS_FM_BUILDER + +// Build BWT and other auxiliary datas from text T using blockwise suffix array sorting + +#include +#include + +#include + +#include "Utils.hpp" +#include "SuffixArrayGenerator.hpp" + +namespace compactds { +struct _FMBuilderParam +{ + size_t n ; + + size_t saBlockSize ; + int saDcv ; + size_t threadCnt ; + + int sampleRate ; + int sampleStrategy ; // on SA, on T or on the ends of BWT runs. + size_t sampleSize ; + size_t *sampledSA ; + + int precomputeWidth ; + size_t precomputeSize ; + std::pair *precomputedRange ; + + bool printLog ; + + size_t maxLcp ; // only consider LCP up to this point + + std::map selectedISA ; + std::map selectedSA ; // reverse selectedISA + + WORD *semiLcpGreater ; // The LCP is between current suffix and its previous one + WORD *semiLcpEqual ; + + size_t adjustedSA0 ; // specialized sampled SA. + + FILE *dumpSaFp ; // dump SA to this file. + + _FMBuilderParam() + { + sampleStrategy = 0 ; + saBlockSize = 1<<24 ; + saDcv = 4096 ; + sampleRate = 1<<5 ; + threadCnt = 1 ; // the number of threads for sorting. 
+ precomputeWidth = 10 ; + adjustedSA0 = 0 ; + + printLog = true ; + + maxLcp = 0 ; + dumpSaFp = NULL ; + + // The memory for these arrays shall handled explicitly outside. + sampledSA = NULL ; + precomputedRange = NULL ; + semiLcpGreater = NULL ; + semiLcpEqual = NULL ; + } + + // Use this free with caution, + // as some of the pointer will be + // used in FMIndexAuxData. + // Use this only when generating BWT string. + void Free() + { + if (sampledSA != NULL) + free(sampledSA) ; + if (precomputedRange != NULL) + free(precomputedRange) ; + if (semiLcpGreater != NULL) + free(semiLcpGreater) ; + if (semiLcpEqual != NULL) + free(semiLcpEqual) ; + } +} ; + +struct _FMBuilderChunkThreadArg +{ + int tid ; + int threadCnt ; + + FixedSizeElemArray *T ; + size_t n ; + + SuffixArrayGenerator *saGenerator ; + size_t from, to ; + std::vector< std::vector > pos ; +} ; + +struct _FMBuilderSASortThreadArg +{ + int tid ; + int threadCnt ; + + FixedSizeElemArray *T ; + size_t n ; + + SuffixArrayGenerator *saGenerator ; + size_t *sa ; + size_t saSize ; + + size_t accuChunkSize ; + + WORD *semiLcpGreater ; // The LCP is between current suffix and its previous one + WORD *semiLcpEqual ; + size_t maxLcp ; // only consider LCP up to this point +} ; + +class FMBuilder +{ +private: + // Return the LCP up until the specified value bewteen T[i,...], T[j,...] 
+ static size_t ComputeSemiLcp(FixedSizeElemArray &T, size_t n, size_t i, size_t j, size_t maxLCP) + { + size_t k ; + if (i >= n || j >= n || i < 0 || j < 0) + return 0 ; + else + { + for (k = 0 ; k < maxLCP && i + k < n && j + k < n ; ++k) + { + if (T.Read(i + k) != T.Read(j + k)) + break ; + } + return k ; + } + } + + static void *PosInChunk_Thread(void *arg) + { + struct _FMBuilderChunkThreadArg *pArg = (_FMBuilderChunkThreadArg *)arg ; + size_t segLen = DIV_CEIL(pArg->n, pArg->threadCnt) ; + size_t s = segLen * pArg->tid ; + size_t e = s + segLen - 1 ; + pArg->saGenerator->GetChunksPositions(*(pArg->T), pArg->n, + pArg->from, pArg->to, s, e, pArg->pos) ; + pthread_exit(NULL) ; + } + + // Compare the semiLCP between T[sai...], and T[saj,...], write the result to semiLcp[biti] + static void SetSemiLcpBit(FixedSizeElemArray &T, size_t n, size_t sai, size_t saj, size_t biti, size_t maxLcp, WORD *semiLcpGreater, WORD *semiLcpEqual) + { + size_t l = 0 ; + l = ComputeSemiLcp(T, n, sai, saj, maxLcp + 1) ; + if (l > maxLcp) + Utils::BitSet(semiLcpGreater, biti) ; + else if (l == maxLcp) + Utils::BitSet(semiLcpEqual, biti) ; + } + + static void *SortSA_Thread(void *arg) + { + struct _FMBuilderSASortThreadArg *pArg = (struct _FMBuilderSASortThreadArg *)arg ; + pArg->saGenerator->SortSuffixByPos(*(pArg->T),pArg->n, + pArg->sa, pArg->saSize, pArg->sa ) ; + //printf("TEST %d\n", saSortThreadArgs[0][0].sa[0]) ; + + if (pArg->maxLcp > 0) + { + size_t i ; + // The first element's LCP is between the last element from previous + // chunk, need to process outside. 
+ for (i = 1 ; i < pArg->saSize ; ++i) + SetSemiLcpBit(*(pArg->T), pArg->n, pArg->sa[i], pArg->sa[i - 1], pArg->accuChunkSize + i, + pArg->maxLcp, pArg->semiLcpGreater, pArg->semiLcpEqual) ; + } + + pthread_exit(NULL) ; + } + +public: + // Allocate and init the memorys for auxiliary data arrays in FM index + // chrbit: number of bits for each character + static void MallocAuxiliaryData(size_t chrbit, size_t n, struct _FMBuilderParam ¶m) + { + size_t i ; + param.n = n ; + + param.sampleSize = DIV_CEIL(n, param.sampleRate) ; + param.sampledSA = (size_t *)malloc(sizeof(size_t) * DIV_CEIL(n, param.sampleRate)) ; + + size_t size = 1ull<<(chrbit * param.precomputeWidth) ; + param.precomputeSize = size ; + param.precomputedRange = (std::pair *)malloc( + sizeof(std::pair) * size) ; + for (i = 0 ; i < size ; ++i) + { + param.precomputedRange[i].first = 0 ; + param.precomputedRange[i].second = 0 ; + } + + if (param.maxLcp > 0) + { + param.semiLcpGreater = Utils::MallocByBits(n) ; + param.semiLcpEqual = Utils::MallocByBits(n) ; + } + } + + // Determine the parameters for block size and difference cover size + // based on memory requirement (bytes). + // Assume mem is quite large. 
+ static void InferParametersGivenMemory(size_t n, int alphabetSize, size_t memory, + struct _FMBuilderParam ¶m) + { + size_t logBlockSize ; + size_t dcv ; + size_t alphabetBits = Utils::Log2Ceil(alphabetSize) ; + + size_t bestTime = POSITIVE_INF ; + size_t bestBlockSize = 0 ; + size_t bestDcv = 0 ; + + if (2 * n * alphabetBits / 8 > memory) + return ; + + memory -= 2 * n * alphabetBits / 8 ; + for (dcv = 512 ; dcv <= 8196 ; dcv *= 2) + { + size_t dcSize = DIV_CEIL(n, dcv) * DifferenceCover::EstimateCoverSize(dcv) ; + for (logBlockSize = 24 ; logBlockSize <= 50 ; ++logBlockSize) + { + size_t blockSize = 1ull<= n / param.threadCnt) + // break ; + + size_t space = (param.threadCnt * blockSize + + dcSize + DIV_CEIL(n, param.sampleRate) + + (1ull<<(alphabetBits * param.precomputeWidth))*2 + ) * WORDBYTES ; + + if (space <= memory) + { + size_t iterations = DIV_CEIL(n, (param.threadCnt * blockSize)) ; + size_t time = dcSize * Utils::Log2Ceil(n) // sort difference cover + + iterations * n * dcv // making cuts + + iterations * (blockSize * Utils::Log2Ceil(blockSize) + dcv * blockSize) ; // sort block + //printf("%lu(%lu) %lu %lu. %lu %lu. %lu\n", dcv, dcSize, blockSize, iterations, + // space, time, memory) ; + if (time < bestTime) + { + bestBlockSize = blockSize ; + bestDcv = dcv ; + bestTime = time ; + } + } + else + break ; + } + } + + if (bestDcv != 0) + { + param.saBlockSize = bestBlockSize ; + param.saDcv = bestDcv ; + + if (param.printLog) + { + Utils::PrintLog("Estimated block size: %lu; dcv:%d", + param.saBlockSize, param.saDcv) ; + } + } + } + + // T: text + // n: len(text) + // firstISA: ISA[0] + // Returned information is in BWT, firstISA, which are important in the F column. param holds all the other allocated array. 
+ static void Build(FixedSizeElemArray &T, size_t n, int alphabetSize, + FixedSizeElemArray &BWT, size_t &firstISA, + struct _FMBuilderParam ¶m) + { + size_t i, j, k ; + SuffixArrayGenerator saGenerator ; + MallocAuxiliaryData(Utils::Log2Ceil(alphabetSize), n, param) ; + BWT.Malloc(Utils::Log2Ceil(alphabetSize), n) ; + if (param.printLog) + Utils::PrintLog("Generate difference cover and chunks.") ; + size_t cutCnt = saGenerator.Init(T, n, param.saBlockSize, param.saDcv, alphabetSize) ; + if (param.printLog) + Utils::PrintLog("Found %llu chunks.", cutCnt) ; + size_t bwtFilled = 0 ; + + pthread_t *threads = (pthread_t *)malloc(sizeof(*threads) * param.threadCnt) ; + struct _FMBuilderChunkThreadArg *chunkThreadArgs ; + struct _FMBuilderSASortThreadArg *saSortThreadArgs ; + pthread_attr_t attr ; + + pthread_attr_init( &attr ) ; + pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_JOINABLE ) ; + size_t **sa ; // suffix array chunks + size_t *saChunkSize ; // actual size + size_t *saChunkCapacity ; // the memory capacity + + chunkThreadArgs = new struct _FMBuilderChunkThreadArg[param.threadCnt] ; + for (i = 0 ; i < param.threadCnt ; ++i) + { + chunkThreadArgs[i].tid = i ; + chunkThreadArgs[i].threadCnt = param.threadCnt ; + chunkThreadArgs[i].saGenerator = &saGenerator ; + chunkThreadArgs[i].T = &T ; + chunkThreadArgs[i].n = n ; + } + + sa = (size_t **)malloc(sizeof(sa[0]) * param.threadCnt) ; + saChunkSize = (size_t *)malloc(sizeof(saChunkSize) * param.threadCnt) ; + saChunkCapacity = (size_t *)malloc(sizeof(saChunkCapacity) * param.threadCnt) ; + saSortThreadArgs = (struct _FMBuilderSASortThreadArg*)malloc(sizeof(struct _FMBuilderSASortThreadArg) * param.threadCnt) ; + for (i = 0 ; i < param.threadCnt ; ++i) + { + sa[i] = NULL ; + saChunkSize[i] = 0 ; + saChunkCapacity[i] = 0 ; + + saSortThreadArgs[i].tid = i ; + saSortThreadArgs[i].threadCnt = param.threadCnt ; + saSortThreadArgs[i].saGenerator = &saGenerator ; + saSortThreadArgs[i].T = &T ; + saSortThreadArgs[i].n = 
n ; + + saSortThreadArgs[i].maxLcp = param.maxLcp ; + saSortThreadArgs[i].semiLcpGreater = param.semiLcpGreater ; + saSortThreadArgs[i].semiLcpEqual = param.semiLcpEqual ; + } + + size_t lastSA = 0 ; // record the last SA from previous batch or chunk + size_t accuChunkSizeForSort = 0 ; // accumulated chunk size + i = 0 ; + + if (param.dumpSaFp) + fwrite(&n, sizeof(size_t), 1, param.dumpSaFp) ; + + // Start the core iterations + for (i = 0 ; i < cutCnt ; i += param.threadCnt) + { + // Load positions for current batch + if (param.printLog) + Utils::PrintLog("Extract %d chunks. (%lu/%lu chunks finished)", param.threadCnt, i, cutCnt) ; + for (j = 0 ; j < param.threadCnt ; ++j) + { + chunkThreadArgs[j].from = i ; + chunkThreadArgs[j].to = (i + param.threadCnt - 1 < n ? i + param.threadCnt - 1 : n - 1) ; + pthread_create(&threads[j], &attr, PosInChunk_Thread, (void *)(chunkThreadArgs + j)) ; + //PosInChunk_Thread((void *)(chunkThreadArgs + j)) ; + } + + if (param.printLog) + Utils::PrintLog("Wait for the chunk extraction to finish.") ; + for (j = 0 ; j < param.threadCnt ; ++j) + pthread_join(threads[j], NULL) ; + + size_t chunkCnt = param.threadCnt ; + if (i + chunkCnt >= cutCnt) + chunkCnt = cutCnt - i ; + + // concatenate the pos in the chunks + for (j = 0 ; j < chunkCnt ; ++j) + { + size_t totalSize = 0 ; + for (k = 0 ; k < param.threadCnt ; ++k) + totalSize += chunkThreadArgs[k].pos[j].size() ; + saChunkSize[j] = totalSize ; + if (totalSize > saChunkCapacity[j]) + { + free(sa[j]) ; + saChunkCapacity[j] = totalSize ; + sa[j] = (size_t *)malloc(sizeof(sa[j]) * totalSize) ; + } + + totalSize = 0 ; + for (k = 0 ; k < param.threadCnt ; ++k) + { + memcpy(sa[j] + totalSize, chunkThreadArgs[k].pos[j].data(), + sizeof(sa[j][0]) * chunkThreadArgs[k].pos[j].size()) ; + totalSize += chunkThreadArgs[k].pos[j].size() ; + std::vector().swap(chunkThreadArgs[k].pos[j]) ; + } + } + + // Submit the batch of chunks to sorting + if (param.printLog) + Utils::PrintLog("Submit %d chunks.", 
chunkCnt) ; + for (j = 0 ; j < chunkCnt ; ++j) + { + if (param.printLog) + Utils::PrintLog("Chunk %d elements: %llu", j, saChunkSize[j]) ; + saSortThreadArgs[j].sa = sa[j] ; + saSortThreadArgs[j].saSize = saChunkSize[j] ; + saSortThreadArgs[j].accuChunkSize = accuChunkSizeForSort ; + accuChunkSizeForSort += saChunkSize[j] ; + pthread_create(&threads[j], &attr, SortSA_Thread, (void *)(saSortThreadArgs + j)) ; + //SortSA_Thread( (void *)(saSortThreadArgs + j)) ; + } + + // Wait for current batch to finish + if (param.printLog) + Utils::PrintLog("Wait for the chunk sort to finish.") ; + for (j = 0 ; j < chunkCnt ; ++j) + pthread_join(threads[j], NULL) ; + + // Process the information from the chunks. + if (param.printLog) + Utils::PrintLog("Postprocess %d chunks.", chunkCnt) ; + for (j = 0 ; j < chunkCnt ; ++j) + { + size_t l ; + size_t size = saSortThreadArgs[j].saSize ; + size_t *saChunk = saSortThreadArgs[j].sa ; + + // Fill FM string + //printf("%d %d %d %d\n", size, j, saSortThreadArgs[prevPosTag][j].pos->at(1), + // saChunk[0]) ; + for (l = 0 ; l < size ; ++l) + { + if (saChunk[l] == 0) + { + firstISA = bwtFilled ; + BWT.Write(bwtFilled, T.Read(n - 1)) ; + } + else + BWT.Write(bwtFilled, T.Read( saChunk[l] - 1 ) ) ; + + if (param.sampledSA != NULL && bwtFilled % param.sampleRate == 0) + param.sampledSA[bwtFilled / param.sampleRate] = saChunk[l] ; + + if (param.precomputedRange != NULL) + { + int width = param.precomputeWidth ; + WORD w = 0 ;// word + if (saChunk[l] + width <= n) + { + w = T.PackRead(saChunk[l], width) ; + if (param.precomputedRange[w].second == 0) + param.precomputedRange[w].first = bwtFilled ; + ++param.precomputedRange[w].second ; + } + /*else // ignore the case near the end of the string + { + w = T.PackRead(saChunk[l], n - saChunk[l]) ; + //size_t used = n - saChunk[l] ; + //w = (T.PackRead(0, w - used)) << used | w + }*/ + + } + + if (param.selectedISA.size() != 0 ) + { + if (param.selectedISA.find(saChunk[l]) != param.selectedISA.end()) + 
param.selectedISA[saChunk[l]] = bwtFilled ; + } + + ++bwtFilled ; + } + + if (param.maxLcp > 0) + { + size_t offseti = bwtFilled - size ; // equiavlent to accuChunkSize + if (i > 0 || j > 0) // ignore the very first SA in the whole array + SetSemiLcpBit(T, n, saChunk[0], lastSA, offseti, param.maxLcp, + param.semiLcpGreater, param.semiLcpEqual) ; + } + + // the last element from previous chunk. + lastSA = saChunk[size - 1] ; + + if (param.dumpSaFp) + fwrite(saChunk, sizeof(saChunk[0]), size, param.dumpSaFp) ; + } + } // end of the main while loop for populating BWTs + + // Fill in the selectedSA + for (std::map::iterator iter = param.selectedISA.begin() ; + iter != param.selectedISA.end(); ++iter) + { + param.selectedSA[iter->second] = iter->first ; + } + std::map().swap(param.selectedISA) ; // ISA will not be useful + + free(threads) ; + pthread_attr_destroy(&attr) ; + delete[] chunkThreadArgs ; + for (j = 0 ; j < param.threadCnt ; ++j) + { + if (sa[j] != NULL) + { + free(sa[j]) ; + } + } + free(sa) ; + free(saChunkSize) ; + free(saChunkCapacity) ; + free(saSortThreadArgs) ; + } +} ; +} + +#endif diff --git a/compactds/FMIndex.hpp b/compactds/FMIndex.hpp new file mode 100644 index 0000000..b306dae --- /dev/null +++ b/compactds/FMIndex.hpp @@ -0,0 +1,491 @@ +#ifndef _MOURISL_COMPACTDS_FM_INDEX +#define _MOURISL_COMPACTDS_FM_INDEX + +#include + +#include "Alphabet.hpp" +#include "FixedSizeElemArray.hpp" +#include "FMBuilder.hpp" + +// Auxiliary data, other than the BWT and F (alphabet partial sum), for FM index +// Should be directly initalized through FMBuilderParam, simplifies the parameter passing +namespace compactds { +struct _FMIndexAuxData +{ + size_t n ; // the length of the text + + int sampleStrategy ; + int sampleRate ; + size_t sampleSize ; + FixedSizeElemArray sampledSA ; + + // precomputedRange: the BWT range for a prefix of size param.precomputeWidth + // The pair format is (the start position, and the length of the range). 
+ // The advantage is that we can easily tell whether a range is empty. + size_t precomputeWidth ; + size_t precomputeSize ; + std::pair *precomputedRange ; + + size_t maxLcp ; // only consider LCP up to this point + WORD *semiLcpGreater ; // The LCP is between current suffix and its previous one + WORD *semiLcpEqual ; + + size_t adjustedSA0 ; + std::map selectedSA ; // SAs for speical purposes: e.g. boundary of genomes + WORD *selectedSAFilter ; // Quick test whether a SA could be selectedSA + int selectedSAFilterSampleRate ; + + bool printLog ; + + _FMIndexAuxData() + { + sampleStrategy = 0 ; + sampleRate = 0 ; + sampleSize = 0 ; + precomputeWidth = 0 ; + precomputeSize = 0 ; + precomputedRange = NULL ; + + maxLcp = 0 ; + semiLcpGreater = NULL ; + semiLcpEqual = NULL ; + + adjustedSA0 = 0 ; + selectedSAFilter = NULL ; + selectedSAFilterSampleRate = 1024 ; + + printLog = true ; + } + + ~_FMIndexAuxData() + { + // NOTE: has to be explicitly called through Free to release the memory. + } ; + + void Free() + { + sampledSA.Free() ; + + if (precomputedRange) + { + free(precomputedRange) ; + precomputedRange = NULL ; + } + + if (semiLcpGreater) + { + free(semiLcpGreater) ; + free(semiLcpEqual) ; + semiLcpGreater = NULL ; + semiLcpEqual = NULL ; + } + + if (selectedSA.size() > 0) + { + selectedSA.clear() ; + free(selectedSAFilter) ; + } + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, n) ; + SAVE_VAR(fp, sampleStrategy) ; + SAVE_VAR(fp, sampleRate) ; + SAVE_VAR(fp, sampleSize) ; + SAVE_VAR(fp, precomputeWidth) ; + SAVE_VAR(fp, precomputeSize) ; + SAVE_VAR(fp, adjustedSA0) ; + + sampledSA.Save(fp) ; + SAVE_ARR(fp, precomputedRange, precomputeSize) ; + + SAVE_VAR(fp, maxLcp) ; + if (maxLcp > 0) + { + fwrite(semiLcpGreater, sizeof(*semiLcpGreater), Utils::BitsToWords(n), fp) ; + fwrite(semiLcpEqual, sizeof(*semiLcpEqual), Utils::BitsToWords(n), fp) ; + } + + // For speical SAs + size_t tmpSize = selectedSA.size() ; + SAVE_VAR(fp, tmpSize) ; + SAVE_VAR(fp, 
selectedSAFilterSampleRate) ; + for (std::map::iterator iter = selectedSA.begin() ; + iter != selectedSA.end() ; ++iter) + { + size_t pair[2] = {iter->first, iter->second} ; + fwrite(pair, sizeof(size_t), 2, fp) ; + } + } + + void Load(FILE *fp) + { + Free() ; + size_t i ; + + LOAD_VAR(fp, n) ; + LOAD_VAR(fp, sampleStrategy) ; + LOAD_VAR(fp, sampleRate) ; + LOAD_VAR(fp, sampleSize) ; + LOAD_VAR(fp, precomputeWidth) ; + LOAD_VAR(fp, precomputeSize) ; + LOAD_VAR(fp, adjustedSA0) ; + + sampledSA.Load(fp) ; + precomputedRange = (std::pair *)malloc( + sizeof(std::pair) * precomputeSize) ; + LOAD_ARR(fp, precomputedRange, precomputeSize) ; + + LOAD_VAR(fp, maxLcp) ; + if (maxLcp > 0) + { + semiLcpGreater = Utils::MallocByBits(n) ; + semiLcpEqual = Utils::MallocByBits(n) ; + fread(semiLcpGreater, sizeof(*semiLcpGreater), Utils::BitsToWords(n), fp) ; + fread(semiLcpEqual, sizeof(*semiLcpEqual), Utils::BitsToWords(n), fp) ; + } + + size_t tmpSize = 0 ; + LOAD_VAR(fp, tmpSize) ; + LOAD_VAR(fp, selectedSAFilterSampleRate) ; + if (tmpSize > 0) + { + selectedSAFilter = Utils::MallocByBits(DIV_CEIL(n, selectedSAFilterSampleRate)) ; + for (i = 0 ; i < tmpSize ; ++i) + { + size_t pair[2] ; + fread(pair, sizeof(size_t), 2, fp) ; + selectedSA[pair[0]] = pair[1] ; + Utils::BitSet(selectedSAFilter, pair[0] / selectedSAFilterSampleRate) ; + } + } + } +} ; + +template +class FMIndex +{ +private: + SeqClass _BWT ; + size_t _n ; + Alphabet _alphabets ; // May handle more complex mapping, e.g. 
Huffman coding + Alphabet _plainAlphabetCoder ; // for plain mapping, important for partial sum access + size_t *_plainAlphabetPartialSum ; + size_t _plainAlphabetBits ; // Needed for coding index accessing precomputedRange + size_t _firstISA ; // ISA[0] + ALPHABET _lastChr ; // last character in the original text + + // @return: whether SA[i] information is stored + // the SA information is returned through the reference sa + bool GetSampledSA(size_t i, size_t &sa) + { + if (i == _firstISA) + { + sa = _auxData.adjustedSA0 ; + return true ; + } + else if (i % _auxData.sampleRate == 0) + { + sa = _auxData.sampledSA[i / _auxData.sampleRate] ; + return true ; + } + else if (_auxData.selectedSAFilter) + { + if (Utils::BitRead(_auxData.selectedSAFilter, i / _auxData.selectedSAFilterSampleRate) + && (_auxData.selectedSA.find(i) != _auxData.selectedSA.end())) + { + sa = _auxData.selectedSA[i] ; + return true ; + } + } + + return false ; + } +public: + struct _FMIndexAuxData _auxData ; // the data used for locate operation + + FMIndex() + { + _n = 0 ; + } + + ~FMIndex() + { + Free() ; + } + + void SetAlphabetCode(const Alphabet &a) + { + _alphabets = a ; + } + + void Free() + { + if (_n > 0) + { + _n = 0 ; + free(_plainAlphabetPartialSum) ; + _auxData.Free() ; + } + } + + void InitAuxData(struct _FMBuilderParam &builderParam) + { + _auxData.n = builderParam.n ; + + _auxData.sampleRate = builderParam.sampleRate ; + _auxData.sampleSize = builderParam.sampleSize ; + _auxData.sampleStrategy = builderParam.sampleStrategy ; + //_auxData.sampledSA = builderParam.sampledSA ; + _auxData.sampledSA.InitFromArray(0, builderParam.sampledSA, _auxData.sampleSize) ; + free(builderParam.sampledSA) ; + + _auxData.precomputeWidth = builderParam.precomputeWidth ; + _auxData.precomputeSize = builderParam.precomputeSize ; + _auxData.precomputedRange = builderParam.precomputedRange ; + + _auxData.maxLcp = builderParam.maxLcp ; + _auxData.semiLcpGreater = builderParam.semiLcpGreater ; + 
_auxData.semiLcpEqual = builderParam.semiLcpEqual ; + + _auxData.adjustedSA0 = builderParam.adjustedSA0 ; + + if (builderParam.selectedSA.size() > 0) + { + _auxData.selectedSAFilter = Utils::MallocByBits(DIV_CEIL(_auxData.n, + _auxData.selectedSAFilterSampleRate)) ; + + _auxData.selectedSA = builderParam.selectedSA ; + for (std::map::iterator iter = _auxData.selectedSA.begin() ; + iter != _auxData.selectedSA.end(); ++iter) + { + Utils::BitSet(_auxData.selectedSAFilter, + iter->first / _auxData.selectedSAFilterSampleRate) ; + } + } + } + + void Init(FixedSizeElemArray &BWT, size_t n, + size_t firstISA, struct _FMBuilderParam& builderParam, + const ALPHABET *alphabetMapping, int alphabetSize) + { + size_t i ; + + _plainAlphabetCoder.InitFromList(alphabetMapping, alphabetSize) ; // The input BWT string should be also plain coded in the same fashion + _plainAlphabetBits = Utils::Log2Ceil(alphabetSize) ; + + if (_alphabets.GetSize() == 0) + _alphabets.InitFromList(alphabetMapping, alphabetSize) ; + + // Auxiliary data structures + _n = n ; + _firstISA = firstISA ; + _lastChr = alphabetMapping[ BWT.Read(firstISA) ] ; + InitAuxData(builderParam) ; + + // L list + _BWT.SetAlphabet(_alphabets) ; + _BWT.Init(BWT, n, alphabetMapping) ; + if (_auxData.printLog) + _BWT.PrintStats() ; + + // F list + _plainAlphabetPartialSum = (size_t *)calloc(alphabetSize + 1, + sizeof(*_plainAlphabetPartialSum)) ; + for (i = 0 ; i < n ; ++i) + { + ++_plainAlphabetPartialSum[BWT.Read(i)] ; + } + for (i = 1 ; i < alphabetSize ; ++i) + _plainAlphabetPartialSum[i] += _plainAlphabetPartialSum[i - 1] ; + for (i = alphabetSize ; i >= 1 ; --i) + _plainAlphabetPartialSum[i] = _plainAlphabetPartialSum[i - 1] ; + _plainAlphabetPartialSum[0] = 0 ; + } + + size_t Rank(ALPHABET c, size_t p, int inclusive = 1) + { + size_t ret = _BWT.Rank(c, p, inclusive) ; + // Since we do not use $, the last character in the original string + // will be moved to the _firstISA instead of the first position + // We need to 
move this back + // Potential future refactoring: appending an A to the end of the string + if (c == _lastChr && (p < _firstISA || (!inclusive && p == _firstISA))) + ++ret ; + return ret ; + } + + void BackwardExtend(ALPHABET c, size_t sp, size_t ep, + size_t &nextSp, size_t &nextEp) + { + size_t offset = _plainAlphabetPartialSum[ _plainAlphabetCoder.Encode(c) ] ; + //printf("%c: %d %d %d. %d %d\n", c, offset, sp, ep, _BWT.Rank(c, sp, 0), + // _BWT.Rank(c, ep)) ; + // Need minus 1 here because the return of Rank is 1-based. + nextSp = offset + Rank(c, sp, /*inclusive=*/0) + 1 - 1 ; + + // TODO: Fix a potential issue of underflow. + // Now it is handled by out side + if (sp != ep) + nextEp = offset + Rank(c, ep) - 1 ; + else + nextEp = nextSp + ((_BWT.Access(ep) == c) ? 0 : -1) ; + } + + // This one is essentially LF mapping + size_t BackwardExtend(ALPHABET c, size_t p) + { + size_t offset = _plainAlphabetPartialSum[ _plainAlphabetCoder.Encode(c) ] ; + return offset + Rank(c, p) - 1 ; + } + + // m - length of s + // Return the [sp, ep] through the option, and the length of matched prefix in size_t + size_t BackwardSearch(char *s, size_t m, size_t &sp, size_t &ep) + { + size_t i ; + if (m < _auxData.precomputeWidth) + return 0 ; + + if (_auxData.precomputeWidth > 0) + { + WORD initW = 0 ; + for (i = 0 ; i < _auxData.precomputeWidth ; ++i) + { + if (!_alphabets.IsIn(s[m - 1 - i])) + { + sp = 1 ; + ep = 0 ; + return i ; + } + initW = (initW << _plainAlphabetBits) | (_plainAlphabetCoder.Encode(s[m - 1 - i])) ; + } + + if (_auxData.precomputedRange[initW].second == 0) + { + sp = 1 ; + ep = 0 ; + return _auxData.precomputeWidth - 1 ; + } + sp = _auxData.precomputedRange[initW].first ; + ep = sp + _auxData.precomputedRange[initW].second - 1 ; + } + else + { + sp = 0 ; + ep = _n - 1 ; + } + + size_t l = _auxData.precomputeWidth ; + size_t nextSp = sp ; + size_t nextEp = ep ; + while (l < m) + { + if (!_alphabets.IsIn(s[m - 1 - l])) + break ; + BackwardExtend(s[m - 1 - l], 
sp, ep, nextSp, nextEp) ; + if ( nextSp > nextEp || nextEp > _n) + break ; + sp = nextSp ; + ep = nextEp ; + ++l ; + } + return l ; + } + + // @return: the value of the sampled SA for BWT[i] + // l is the offset between + size_t BackwardToSampledSA(size_t i, size_t &l) + { + l = 0 ; + size_t ret = 0 ; + while (!GetSampledSA(i, ret)) + { + i = BackwardExtend( _BWT.Access(i), i) ; + ++l ; + } + return ret ; + } + + // return ISA[n - 1] + size_t GetLastISA() + { + return _plainAlphabetPartialSum[ _plainAlphabetCoder.Encode(_lastChr) ] ; + } + + // Calculate the values for SA[sp..ep] + void LocateRange(size_t sp, size_t ep, bool withOffset, std::vector &locatedSA) + { + size_t i ; + locatedSA.clear() ; + for (i = sp ; i <= ep ; ++i) + { + size_t l ; + size_t sa = BackwardToSampledSA(i, l) ; + if (withOffset) + locatedSA.push_back(sa + l) ; + else + locatedSA.push_back(sa) ; + } + } + + size_t GetSize() + { + return _n ; + } + + size_t GetAlphabetSize() + { + return _alphabets.GetSize() ; + } + + void PrintSpace() + { + Utils::PrintLog("FM-index space usage (bytes):") ; + Utils::PrintLog("BWT: %llu", _BWT.GetSpace()) ; + Utils::PrintLog("sampledSA: %llu", _auxData.sampledSA.GetSpace()) ; + Utils::PrintLog("precomputedRange: %llu", _auxData.precomputeSize * sizeof(*_auxData.precomputedRange)) ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _plainAlphabetBits) ; + SAVE_VAR(fp, _firstISA) ; + SAVE_VAR(fp, _lastChr) ; + + _BWT.Save(fp) ; + + _alphabets.Save(fp) ; + _plainAlphabetCoder.Save(fp) ; + size_t alphabetSize = _plainAlphabetCoder.GetSize() ; + SAVE_ARR(fp, _plainAlphabetPartialSum, alphabetSize + 1) ; + + _auxData.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _plainAlphabetBits) ; + LOAD_VAR(fp, _firstISA) ; + LOAD_VAR(fp, _lastChr) ; + + _BWT.Load(fp) ; + + _alphabets.Load(fp) ; + _plainAlphabetCoder.Load(fp) ; + size_t alphabetSize = _plainAlphabetCoder.GetSize() ; + _plainAlphabetPartialSum = 
(size_t *)calloc(alphabetSize + 1, + sizeof(*_plainAlphabetPartialSum)) ; + LOAD_ARR(fp, _plainAlphabetPartialSum, alphabetSize + 1) ; + + _auxData.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/FixedSizeElemArray.hpp b/compactds/FixedSizeElemArray.hpp new file mode 100644 index 0000000..4591774 --- /dev/null +++ b/compactds/FixedSizeElemArray.hpp @@ -0,0 +1,322 @@ +#ifndef _MOURISL_COMPACTDS_FIXEDSIZEELEM_ARRAY +#define _MOURISL_COMPACTDS_FIXEDSIZEELEM_ARRAY + +#include +#include + +#include + +#include "Utils.hpp" + +/* + * The class for the array where each element is of fixed size + * We use a word size w = 64bit to maximize the chance of within word access + * Externally the index is continuous, but interally they are segmented by the word as the right of Fig 3.3 + */ + +namespace compactds { +class FixedSizeElemArray +{ +private: + WORD *_W ; + size_t _size ; // memory size in word + int _l ; + size_t _n ; +public: + FixedSizeElemArray() + { + _W = NULL ; + _size = 0 ; + _n = 0 ; + _l = 0 ; + } + + ~FixedSizeElemArray() + { + Free() ; + } + + // Allocate the memory for _n elements, where each elements takes _l bits + void Malloc(int l, size_t n) + { + Free() ; + this->_n = n ; + this->_l = l ; + _size = Utils::BitsToWords(l * n) ; + _W = Utils::MallocByBits(l * n) ; + } + + // _l - number of bits for each element. 
<=0: automatically decide + // in - input array + // n - the length of input array + void InitFromArray(int l, const unsigned int *in, const size_t &n) + { + size_t i ; + if (l <= 0) + { + // We determine the best fixed size + l = 1 ; + for (i = 0 ; i < n ; ++i) + { + int bitCounts = Utils::CountBits(in[i]) ; + if (bitCounts > l) + l = bitCounts ; + } + } + + Malloc(l, n) ; + for (i = 0 ; i < n ; ++i) + Write(i, in[i]) ; + } + + void InitFromArray(int l, const size_t *in, const size_t &n) + { + size_t i ; + if (l <= 0) + { + // We determine the best fixed size + l = 1 ; + for (i = 0 ; i < n ; ++i) + { + int bitCounts = Utils::CountBits(in[i]) ; + if (bitCounts > l) + l = bitCounts ; + } + } + + Malloc(l, n) ; + for (i = 0 ; i < n ; ++i) + Write(i, in[i]) ; + } + + void Free() + { + if (_W != NULL) + free(_W) ; + _W = NULL ; + _n = _l = 0 ; + } + + // Get the i-th element + uint64_t Read(size_t i) const + { + return Utils::BitsRead(_W, i * _l, (i + 1)* _l - 1) ; + } + + uint64_t operator[](size_t i) const + { + return Read(i) ; + } + + void Write(size_t i, int x) + { + Utils::BitsWrite(_W, i * _l, (i + 1) * _l - 1, x) ; + } + + /*uint64_t Read64(size_t i) const + { + return Utils::BitsRead(_W, i * _l, (i + 1)* _l - 1) ; + }*/ + + void Write64(size_t i, uint64_t x) + { + Utils::BitsWrite(_W, i * _l, (i + 1) * _l - 1, x) ; + } + + size_t GetSpace() const + { + return sizeof(_W[0]) * _size + sizeof(*this) ; + } + + int GetElemLength() const + { + return _l ; + } + + void SetElemLength(int l) + { + _l = l ; + } + + size_t GetSize() const + { + return _n ; + } + + // Assume we don't need to change the memory size + void SetSize(size_t n) + { + _n = n ; + } + + const WORD* GetData() const + { + return _W ; + } + + // Return num elements starting from i. 
+ // @return: bit packed _W[i].._W[i + num - 1] + WORD PackRead(size_t i, size_t num) const + { + return Utils::BitsRead(_W, i * _l, (i + num) * _l - 1) ; + } + + WORD PackReadRev(size_t i, size_t num) const + { + size_t j ; + WORD ret = 0 ; + for (j = 0 ; j < num ; ++j) + ret = (ret << _l) + Read(i + j) ; + return ret ; + } + + // Find the length of the matching prefix between A[s..e] and B[s..e] + // assumes _l is the same + // If all match, return min(e-s+1, eb-sb+1) + size_t PrefixMatchLen(size_t s, size_t e, const FixedSizeElemArray &B, size_t sb, size_t eb) const + { + if (e >= _n) + e = _n - 1 ; + if (eb >= B._n) + eb = B._n - 1 ; + size_t ai ; + size_t bi ; + + int block = WORDBITS / _l ; + ai = s ; + bi = sb ; + int len = MIN(e-s+1, eb-sb+1) ; + if (len < block) + block = len ; + if (block > 1) + { + for ( ; ai + block - 1 <= e && bi + block - 1 <= eb ; + ai += block, bi += block) + { + WORD wa = PackRead(ai, block) ; + WORD wb = B.PackRead(bi, block) ; + if (wa == wb) + continue ; + + int k ; + for (k = 0 ; k < block ; ++k) + { + WORD smalla = (wa >> (k * _l)) & MASK(_l) ; + WORD smallb = (wb >> (k * _l)) & MASK(_l) ; + if (smalla != smallb) + return ai + k - s ; + } + } + } + + for ( ; ai <= e && bi <= eb ; ++ai, ++bi) + { + WORD smalla = Read(ai) ; + WORD smallb = B.Read(bi) ; + if (smalla != smallb) + return ai - s ; + } + + return MIN(e - s + 1, eb - sb + 1) ; + } + + // Compare A[s..e] and B[sb..eb] + // @return: sign(A-B) + int SubrangeCompare(size_t s, size_t e, const FixedSizeElemArray &B, size_t sb, size_t eb) const + { + if (_l != B._l) + return _l - B._l ; + if (e >= _n) + e = _n - 1 ; + if (eb >= B._n) + eb = B._n - 1 ; + size_t matchCnt = PrefixMatchLen(s, e, B, sb, eb) ; + + if (matchCnt == MIN(e - s + 1, eb - sb + 1)) + { + if (e - s + 1 == eb - sb + 1) + return 0 ; + else if (e - s + 1 < eb - sb + 1) + return -1 ; + else + return 1 ; + } + else + { + WORD smalla = Read(s + matchCnt) ; + WORD smallb = B.Read(sb + matchCnt) ; + + if (smalla 
< smallb) + return -1 ; + else // they have to be different at this point + return 1 ; + } + } + + // Malloc by copying the first p element of B + void InitFromOtherPrefix(const FixedSizeElemArray &B, size_t p) + { + Malloc(B._l, p) ; + size_t wordBytes = Utils::BitsToWordBytes(_n * _l) ; + memcpy(_W, B._W, wordBytes) ; + } + + void Resize(size_t newn) + { + _n = newn ; + _size = Utils::BitsToWords(_l * newn) ; + _W = (WORD *)realloc(_W, _size * sizeof(WORD)) ; + } + + // Reserve the space for m elements without changing current element + void Reserve(size_t m) + { + if (m <= _n) + return; + + _size = Utils::BitsToWords(_l * m) ; + if (_W != NULL) + _W = (WORD *)realloc(_W, _size * sizeof(WORD)) ; + else + _W = Utils::MallocByBits(_l * m) ; + } + + // push back another element to the end of the array. + // This function also handles expand the array + void PushBack(int x) + { + if (Utils::BitsToWords(_l * _n) == _size) + Reserve(2 * _n) ; + Write(_n, x); + ++_n ; + } + + void Print(FILE *fp, char sep = ' ') const + { + size_t i ; + for (i = 0 ; i < _n ; ++i) + fprintf(fp, "%d%c", (int)Read(i), sep) ; + fprintf(fp, "\n") ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _size) ; + SAVE_VAR(fp, _l) ; + SAVE_VAR(fp, _n) ; + fwrite(_W, sizeof(_W[0]), Utils::BitsToWords(_n * _l), fp) ; + } + + void Load(FILE *fp) + { + Free() ; + LOAD_VAR(fp, _size) ; + LOAD_VAR(fp, _l) ; + LOAD_VAR(fp, _n) ; + _W = Utils::MallocByBits(WORDBITS * _size) ; + fread(_W, sizeof(_W[0]), Utils::BitsToWords(_n * _l), fp) ; + } +} ; +} + +#endif diff --git a/compactds/FractionBitElemArray.hpp b/compactds/FractionBitElemArray.hpp new file mode 100644 index 0000000..6a1d68f --- /dev/null +++ b/compactds/FractionBitElemArray.hpp @@ -0,0 +1,118 @@ +#ifndef _MOURISL_COMPACTDS_FRACTIONBITELEM_ARRAY +#define _MOURISL_COMPACTDS_FRACTIONBITELEM_ARRAY + +#include +#include + +#include + +#include "Utils.hpp" + +/* + * The class for the array where each element is in the range of [0..d-1] and _d is far 
from the power of 2 + * The idea is that each WORD is a d-ary number (Section 3.1) + */ + +namespace compactds { +class FractionBitElemArray +{ +private: + WORD *_W ; + const int _w ; + size_t _size ; + size_t _d ; // element is in the range of [0..d-1] + size_t _n ; + int _k ; // number of elements per word +public: + FractionBitElemArray():_w(8 * sizeof(WORD)) + { + _W = NULL ; + } + + ~FractionBitElemArray() + { + Free() ; + } + + // Allocate the memory for _n elements, where each element is in the range of [0..d-1] + void Malloc(size_t d, size_t n) + { + this->_n = n ; + this->_d = d ; + _k = (int)(_w / ((double)log((double)_d) / (double)log(2.0))) ; + _size = DIV_CEIL(n, _k) ; + _W = Utils::MallocByBits(_size * WORDBITS) ; + } + + // in - input array + // n - the length of input array + void InitFromArray(size_t d, const unsigned int *in, const size_t &n) + { + size_t i ; + if (d == 0) + { + // We determine the best fixed size + d = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (in[i] > d) + d = in[i] ; + } + ++d ; + } + + Malloc(d, n) ; + for (i = 0 ; i < _n ; ++i) + Write(i, in[i]) ; + } + + void Free() + { + if (_W != NULL) + free(_W) ; + _W = NULL ; + } + + // Get the i-th element + unsigned Read(size_t i) const + { + return (_W[i/_k] / Utils::PowerInt(_d, i%_k)) % _d ; + } + + void Write(size_t i, int x) + { + size_t j = i / _k ; + size_t p = Utils::PowerInt(_d, i%_k) ; + _W[j] = _W[j] - ((_W[j] / p) %_d) * p + x * p ; + } + + size_t GetSpace() const + { + return sizeof(_W[0]) * _size + sizeof(*this) ; + } + + int GetElemRange() const + { + return _d ; + } + + size_t GetSize() const + { + return _n ; + } + + const WORD* GetData() const + { + return _W ; + } + + void Resize(size_t newn) + { + _n = newn ; + _size = Utils::BitsToWords(DIV_CEIL(_n, _k)) ; + _W = (WORD *)realloc(_W, _size * sizeof(WORD)) ; + } +} ; +} + +#endif diff --git a/compactds/HuffmanCode.hpp b/compactds/HuffmanCode.hpp new file mode 100644 index 0000000..856d462 --- /dev/null +++ 
b/compactds/HuffmanCode.hpp @@ -0,0 +1,230 @@ +#ifndef _MOURISL_COMPACTDS_HUFFMANCODE +#define _MOURISL_COMPACTDS_HUFFMANCODE + +#include + +#include "Utils.hpp" + +namespace compactds { +struct _huffman_node +{ + int symbol ; + uint64_t freq ; + int next ; // used in _tree construction as a linked list + int left, right ; // Left, right children + bool operator <(const struct _huffman_node &b) const + { + return freq < b.freq ; + } +} ; + +class HuffmanCode +{ +private: + WORD *_codes ; // assume alphabet set is in [0, n-1]. + int *_codeLens ; + size_t _n ; // the size of the alphabet + struct _huffman_node *_tree ; + size_t _space ; + + // Algorithm 2.2: building a huffman _tree with linked list instead of heap + void BuildTree(struct _huffman_node *elems, size_t n) + { + std::sort(elems, elems + n) ; + + size_t i ; + size_t nodeCnt ; + + _tree = (struct _huffman_node *)malloc(sizeof(*_tree) * (2 * n - 1)) ; + _space += (sizeof(*_tree) * (2 * n - 1)) ; + + for (i = 0 ; i < n ; ++i) + { + _tree[i] = elems[i] ; + if (i + 1 < n) + _tree[i].next = i + 1 ; + else + _tree[i].next = -1 ; + _tree[i].left = _tree[i].right = -1 ; + } + size_t minTag = 0 ; // minTag and minTag+1 is the availble two nodes with minimum + size_t insertTag = 0 ; // the start position to search the next insert node + // this marker is the key for linear time building the _tree after sorting. 
+ nodeCnt = n ; + int p ; + while (1) + { + int a = minTag ; + int b = _tree[minTag].next ; + if (b == -1) + break ; + _tree[nodeCnt].symbol = -1 ; + _tree[nodeCnt].freq = _tree[a].freq + _tree[b].freq ; + _tree[nodeCnt].left = a ; + _tree[nodeCnt].right = b ; + + // Search for the appropriate position to insert the new element + p = insertTag ; + while (_tree[p].next != -1 && _tree[ _tree[p].next ].freq <= _tree[nodeCnt].freq) + p = _tree[p].next ; + + _tree[nodeCnt].next = _tree[p].next ; + _tree[p].next = nodeCnt ; + + insertTag = nodeCnt ; + ++nodeCnt ; + minTag = _tree[b].next ; + } + } + + // Recurisvely traverse the Huffman _tree to put the code + void CreateCodes(int tag, WORD c, int l) + { + if (_tree[tag].left == -1 && _tree[tag].right == -1) + { + _codes[_tree[tag].symbol] = c ; + _codeLens[_tree[tag].symbol] = l ; + return ; + } + + CreateCodes(_tree[tag].left, c<<1, l + 1) ; + CreateCodes(_tree[tag].right, (c<<1) + 1, l + 1) ; + } + + void InternalInit(struct _huffman_node *elems, size_t n) + { + this->_n = n ; + _space = 0 ; + + BuildTree(elems, n) ; + _codes = (WORD *)malloc(sizeof(*_codes) * n) ; + _codeLens = (int *)malloc(sizeof(*_codeLens) * n) ; + CreateCodes(2*n - 2, 0, 0) ; + } + +public: + HuffmanCode() + { + _n = _space = 0 ; + _codes = NULL ; + _codeLens = NULL ; + _tree = NULL ; + } + ~HuffmanCode() {Free();} + + void Free() + { + _n = _space = 0 ; + if (_codes != NULL) + { + free(_codes) ; free(_codeLens) ; free(_tree) ; + _codes = NULL ; + _codeLens = NULL ; + _tree = NULL ; + } + } + + size_t GetSpace() + { + return _space + sizeof(*this) ; + } + + int GetSize() + { + return _n ; + } + + struct _huffman_node *GetTree() const + { + return _tree ; + } + + size_t GetRoot() const + { + return 2 * _n - 2 ; + } + + HuffmanCode &operator =(const HuffmanCode &in) + { + Free() ; + + if (in._n == 0) + return *this; + _n = in._n ; + _space = in._space ; + + _codes = (WORD *)malloc(sizeof(*_codes) * _n) ; + _codeLens = (int 
*)malloc(sizeof(*_codeLens) * _n) ; + _tree = (struct _huffman_node *)malloc(sizeof(*_tree) * (2*_n-1)) ; + + memcpy(_codes, in._codes, sizeof(*_codes) * _n) ; + memcpy(_codeLens, in._codeLens, sizeof(*_codeLens) * _n) ; + memcpy(_tree, in._tree, sizeof(*_tree)) ; + + return *this ; + } + + void InitFromFrequency(const uint64_t *freq, const size_t n) + { + size_t i ; + struct _huffman_node *elems = (struct _huffman_node*)malloc(sizeof(*elems) * n); + + for (i = 0 ; i < n ; ++i) + { + elems[i].symbol = i ; + elems[i].freq = freq[i] ; + } + InternalInit(elems, n) ; + + free(elems) ; + } + + int GetDepth(int tag) + { + if (_tree[tag].left == -1) + return 0 ; + int ldepth = GetDepth(_tree[tag].left) ; + int rdepth = GetDepth(_tree[tag].right) ; + return 1 + (ldepth > rdepth ? ldepth : rdepth) ; + } + + WORD Encode(int x, int &l) const + { + l = _codeLens[x] ; + return _codes[x] ; + } + + int Decode(WORD c, int l) const + { + int i ; + int p = 2 * _n - 2 ; // root + for (i = 0 ; i < l ; ++i) + { + if ((c >> (l - i - 1)) & 1) + p = _tree[p].right ; + else + p = _tree[p].left ; + } + return _tree[p].symbol ; + } + + void Save(FILE *fp) + { + fwrite(this, sizeof(this), 1, fp) ; + fwrite(_tree, sizeof(_tree[0]), 2 * _n - 1, fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + fread(this, sizeof(this), 1, fp) ; + _codes = (WORD *)malloc(sizeof(*_codes) * _n) ; + _codeLens = (int *)malloc(sizeof(*_codeLens) * _n) ; + _tree = (struct _huffman_node *)malloc(sizeof(*_tree) * (2*_n-1)) ; + fwrite(_tree, sizeof(_tree[0]), 2 * _n - 1, fp) ; + CreateCodes(2*_n - 2, 0, 0) ; + } +} ; +} +#endif diff --git a/compactds/InterleavedFixedSizeElemArray.hpp b/compactds/InterleavedFixedSizeElemArray.hpp new file mode 100644 index 0000000..2504def --- /dev/null +++ b/compactds/InterleavedFixedSizeElemArray.hpp @@ -0,0 +1,238 @@ +#ifndef _MOURISL_COMPACTDS_INTERLEAVEDFIXEDSIZEELEM_ARRAY +#define _MOURISL_COMPACTDS_INTERLEAVEDFIXEDSIZEELEM_ARRAY + +// The class handles two levels of arrays. 
+// Also a class where the first level is 64bit. + +#include "Utils.hpp" + +namespace compactds { +class InterleavedFixedSizeElemArray +{ +private: + size_t _l0, _l1 ; // length of element 0 and 1 + size_t _n0 ; + size_t _f1 ; // frequency of element 1 after each element 0 + size_t _size ; //memory size, in words + WORD *_W ; +public: + InterleavedFixedSizeElemArray() + { + _W = NULL ; + _size = 0 ; + _n0 = _f1 = 0 ; + _l0 = _l1 = 0 ; + } + + ~InterleavedFixedSizeElemArray() + { + Free() ; + } + + void Free() + { + if (_n0 > 0) + { + free(_W) ; + _W = NULL ; + _size = 0 ; + _n0 = _f1 = 0 ; + _l0 = _l1 = 0 ; + } + } + + size_t GetSpace() const + { + return sizeof(_W[0]) * _size + sizeof(*this) ; + } + + void Malloc(size_t l0, size_t n0, int l1, size_t f1) + { + Free() ; + + _l0 = l0 ; + _n0 = n0 ; + _l1 = l1 ; + _f1 = f1 ; + _size = Utils::BitsToWords(l0 * n0 + l1 * n0 * f1) ; + _W = (WORD *)malloc(_size * sizeof(WORD)) ; + } + + void Resize(size_t newn1) + { + _n0 = newn1 ; + _size = Utils::BitsToWords(_l0 * _n0 + _l1 * _n0 * _f1) ; + _W = (WORD *)realloc(_W, _size * sizeof(WORD)) ; + } + + int GetElem0Length() const + { + return _l0 ; + } + + int GetElem1Length() const + { + return _l1 ; + } + + size_t GetSize0() const + { + return _n0 ; + } + + size_t GetSize1() const + { + return _n0 * _f1 ; + } + + void SetSize(size_t n0) + { + _n0 = n0 ; + } + + WORD Read(int type, size_t i) const + { + if (type == 0) + { + const size_t offset = i * (_l0 + _f1 * _l1) ; + return Utils::BitsRead(_W, offset, offset + _l0 - 1) ; + } + else + { + const size_t offset = (i / _f1) * (_l0 + _f1 * _l1) + _l0 + _l1 * (i%_f1); + return Utils::BitsRead(_W, offset, offset + _l1 - 1) ; + } + } + + void Write(size_t type, size_t i, int x) + { + if (type == 0) + { + const size_t offset = i * (_l0 + _f1 * _l1) ; + Utils::BitsWrite(_W, offset, offset + _l0 - 1, x) ; + } + else + { + const size_t offset = (i / _f1) * (_l0 + _f1 * _l1) + _l0 + _l1 * (i%_f1); + Utils::BitsWrite(_W, offset, offset + 
_l1 - 1, x) ; + } + } +} ; + +// Optimized for level 0 is 64bit integer. +// The second level will be paded +class Interleaved64FixedSizeElemArray +{ +private: + size_t _l1 ; // length of element 0 and 1 + size_t _n0 ; + size_t _f1 ; // frequency of element 1 after each element 0 + size_t _size ; //memory size + WORD *_W ; + size_t _b ; // block size for each element 0 and attached element 1, in words +public: + Interleaved64FixedSizeElemArray() + { + _W = NULL ; + _size = 0 ; + _n0 = _f1 = 0 ; + _l1 = 0 ; + } + + ~Interleaved64FixedSizeElemArray() + { + Free() ; + } + + void Free() + { + if (_n0 > 0) + { + free(_W) ; + _W = NULL ; + _size = 0 ; + _n0 = _f1 = 0 ; + _l1 = 0 ; + } + } + + size_t GetSpace() const + { + return sizeof(_W[0]) * _size + sizeof(*this) ; + } + + void Malloc(size_t n0, int l1, size_t f1) + { + Free() ; + + _n0 = n0 ; + _l1 = l1 ; + _f1 = f1 ; + _b = Utils::BitsToWords(WORDBITS + DIV_CEIL(l1 * f1, WORDBITS) * WORDBITS) ; + _size = Utils::BitsToWords(_n0 * _b * WORDBITS) ; + _W = (WORD *)malloc(_size * sizeof(WORD)) ; + } + + void Resize(size_t newn1) + { + _n0 = newn1 ; + _size = Utils::BitsToWords(_n0 * _b * WORDBITS) ; + _W = (WORD *)realloc(_W, _size * sizeof(WORD)) ; + } + + int GetElemr0Length() const + { + return 64 ; + } + + int GetElem2Length() const + { + return _l1 ; + } + + size_t GetSize1() const + { + return _n0 ; + } + + size_t GetSize2() const + { + return _n0 * _f1 ; + } + + void SetSize(size_t n0) + { + _n0 = n0 ; + } + + WORD Read0(size_t i) const + { + return _W[i * _b] ; + } + + WORD Read1(size_t i) const + { + const size_t tmp = i / _f1 ; + const size_t offset = (tmp * _b + 1)* WORDBITS + (i - tmp * _f1) * _l1 ; + return Utils::BitsRead(_W, offset, offset + _l1 - 1 ) ; + } + + void Write0(size_t i, WORD x) + { + _W[i * _b] = x ; + } + + void Write1(size_t i, int x) + { + const size_t tmp = i / _f1 ; + const size_t offset = (tmp * _b + 1)* WORDBITS + (i - tmp * _f1) * _l1 ; + Utils::BitsWrite(_W, offset, offset + _l1 - 1, 
x ) ; + } +} ; + + +typedef InterleavedFixedSizeElemArray ILArray ; +typedef Interleaved64FixedSizeElemArray IL64Array ; +} + +#endif diff --git a/compactds/InvertedIndex.hpp b/compactds/InvertedIndex.hpp new file mode 100644 index 0000000..35ac2e0 --- /dev/null +++ b/compactds/InvertedIndex.hpp @@ -0,0 +1,131 @@ +#ifndef _MOURISL_COMPACTDS_INVERTEDINDEX +#define _MOURISL_COMPACTDS_INVERTEDINDEX + +// Use permutation to represent inverted index + +#include "Utils.hpp" +#include "FixedSizeElemArray.hpp" +#include "Permutation.hpp" +#include "Bitvector_Plain.hpp" +#include "CompactMapper.hpp" + +namespace compactds { +class InvertedIndex +{ +private: + size_t _n ; + Permutation _pi ; + Bitvector_Plain _D ; // marker of the start position for each number/alphabet in the concatendated permutation list. + CompactMapper _map ; + size_t _space ; + +public: + InvertedIndex() + { + } + + ~InvertedIndex() + { + } + + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + void Init(const FixedSizeElemArray &list, size_t n, bool sparseMap) + { + size_t i ; + _n = n ; + + _map.Init(list, n, sparseMap) ; + + size_t *pi = (size_t *)malloc(sizeof(*pi) * _n) ; + size_t *psum = (size_t *)calloc(sizeof(*psum), _n) ; + for (i = 0 ; i < _n ; ++i) + { + ++psum[ _map.Map(list.Read(i)) ] ; + } + + size_t m = _map.GetCompactSize() ; + + _D.Malloc(_n) ; + _D.BitSet(0) ; + for (i = 1 ; i < m ; ++i) + { + psum[i] += psum[i - 1] ; + _D.BitSet(psum[i - 1]) ; + } + for (i = m - 1 ; i > 0 ; --i) + psum[i] = psum[i - 1] ; + psum[0] = 0 ; + _D.Init() ; + + for (i = 0 ; i < _n ; ++i) + { + size_t tmp = _map.Map(list.Read(i)) ; + pi[ psum[tmp] ] = i ; + ++psum[tmp] ; + } + _pi.Init(pi, n) ; + + free(pi) ; + free(psum) ; + } + + // Search the ith occurence label l (0-based) + size_t Search(size_t l, size_t i) const + { + size_t mapl = _map.Map(l) ; + return _pi.Next( _D.Select(mapl + 1) + i) ; + } + + // @return: the number of positions for label l + 
size_t Positions(size_t l, std::vector &pos) const + { + size_t mapl = _map.Map(l) ; + size_t i, cnt ; + if (mapl == _map.GetCompactSize() - 1) + cnt = _n - _D.Select(mapl + 1) ; + else + cnt = _D.Select(mapl + 2) - _D.Select(mapl) ; + + size_t start = _D.Select(mapl + 1) ; + for (i = 0 ; i < cnt ; ++i) + pos.push_back( _pi.Next(start + i) ) ; + + return cnt ; + } + + // Count the number of label l in the sequences + size_t Count(size_t l) const + { + size_t mapl = _map.Map(l) ; + if (mapl == _map.GetCompactSize() - 1) + return _n - _D.Select(mapl + 1) ; + else + return _D.Select(mapl + 2) - _D.Select(mapl) ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _space) ; + _pi.Save(fp) ; + _D.Save(fp) ; + _map.Save(fp) ; + } + + void Load(FILE *fp) + { + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _space) ; + _pi.Load(fp) ; + _D.Load(fp) ; + _map.Load(fp) ; + } +} ; + +} + +#endif diff --git a/compactds/Makefile b/compactds/Makefile new file mode 100644 index 0000000..65ad2ea --- /dev/null +++ b/compactds/Makefile @@ -0,0 +1,31 @@ +CXX = g++ +CXXFLAGS= -Wall -g -msse4.2 -O3 #-pg -g #-Wall #-O3 +LINKPATH= +LINKFLAGS = -lpthread -lz +DEBUG= +OBJECTS = #BaseReads.o Alignment.o +HEADERS = *.hpp + +#asan=1 +ifneq ($(asan),) + CXXFLAGS+=-fsanitize=address -g + LDFLAGS+=-fsanitize=address -ldl -g +endif + +#all: bitvector-benchmark #test #bitvector-benchmark +all: test #rbbwt #bitvector-benchmark + +test: test.o $(OBJECTS) + $(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $< $(OBJECTS) $(LINKFLAGS) + +bitvector-benchmark: bitvector_benchmark.cpp $(HEADERS) + $(CXX) -o $@ $(LINKPATH) $< $(LINKFLAGS) -std=c++11 -O3 -ffast-math -funroll-loops -msse4.2 -march=native -DHAVE_CXA_DEMANGLE + #$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $< $(OBJECTS) -std=c++11 $(LINKFLAGS) + +rbbwt: rbbwt.cpp $(HEADERS) + $(CXX) -o $@ $(LINKPATH) $< $(LINKFLAGS) -g -std=c++11 -Ofast -march=native -fstrict-aliasing + +test.o: test.cpp $(HEADERS) + +clean: + rm -f *.o *.gch test diff --git 
a/compactds/PartialSum.hpp b/compactds/PartialSum.hpp new file mode 100644 index 0000000..124a851 --- /dev/null +++ b/compactds/PartialSum.hpp @@ -0,0 +1,140 @@ +#ifndef _MOURISL_COMPACTDS_PARTIALSUM +#define _MOURISL_COMPACTDS_PARTIALSUM + +#include "Utils.hpp" +#include "Bitvector_Sparse.hpp" + +namespace compactds { +class PartialSum +{ +private: + Bitvector_Sparse _B ; // underlying sparse bit vector + size_t _n ; + uint64_t _totalSum ; +public: + PartialSum() + { + _n = _totalSum = 0 ; + } + + ~PartialSum() + { + Free() ; + } + + int GetSpace() + { + return _B.GetSpace() + sizeof(*this) ; + } + + void Free() + { + _B.Free() ; + } + + void SetSupportSearch(bool supportSearch) + { + _B.SetSupportRank(supportSearch) ; + } + + void SetSpeed(int speed) + { + _B.SetSpeed(speed) ; + } + + void Init(const int *array, const size_t n) + { + size_t i ; + uint64_t *psum ; + psum = (uint64_t *)malloc(sizeof(*psum) * (n+1)) ; // We store an extra element for all the length in sum + + psum[0] = 0 ; + for (i = 1 ; i < n + 1 ; ++i) + psum[i] = psum[i - 1] + array[i - 1] ; + InitFromPartialSum(psum, n) ; + free(psum) ; + } + + void Init(const size_t *array, const size_t n) + { + size_t i ; + uint64_t *psum ; + psum = (uint64_t *)malloc(sizeof(*psum) * (n+1)) ; // We store an extra element for all the length in sum + + psum[0] = 0 ; + for (i = 1 ; i < n + 1 ; ++i) + psum[i] = psum[i - 1] + array[i - 1] ; + InitFromPartialSum(psum, n) ; + free(psum) ; + } + + // n is the number of elements + // psum records the partial sum before the i-th element + // the last element should be the total sum, so need to store psum[n] + void InitFromPartialSum(const uint64_t *psum, const size_t n) + { + this->_n = n ; + this->_totalSum = psum[n] ; + _B.InitFromOnes(psum, n + 1, _totalSum) ; + } + + // Initalize where the numbers are marked on bit vector + // i.e., the partial sum is the index on the bit array + // It assumes the lowest bit of W[0] is 1, and the last + // index corresponds to the 
total sum + void InitFromBitvector(WORD *W, const size_t wsize) + { + _B.Init(W, wsize) ; + _n = _B.GetOneCnt() - 1 ; + _totalSum = _B.GetLastOneIdx() ; + } + + // Get the partial sum for index i + // sum_0^[i-1] A[j] + // Another interpretation is the summation for the first i elements. + uint64_t Sum(size_t i) const + { + if (i == 0) + return 0 ; + else if (i >= _n) + return _totalSum ; + else + // The input to Select is 1-based + return _B.Select(i + 1) ; + } + + // Return the max i that Sum(i) <= the value of v + size_t Search(const uint64_t v) const + { + if (v >= _totalSum) + return _n ; + return _B.Rank(1, (size_t)v) - 1 ; + } + + // Read the value of an element + int AccessValue(size_t i) const + { + if (i >= _n) + return -1 ; + return (int)(Sum(i + 1) - Sum(i)) ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _n); + SAVE_VAR(fp, _totalSum); + _B.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + LOAD_VAR(fp, _n); + LOAD_VAR(fp, _totalSum); + _B.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/PerfectHash.hpp b/compactds/PerfectHash.hpp new file mode 100644 index 0000000..5322fbf --- /dev/null +++ b/compactds/PerfectHash.hpp @@ -0,0 +1,199 @@ +#ifndef _MOURISL_COMPACTDS_PERFECTHASH +#define _MOURISL_COMPACTDS_PERFECTHASH + +// Generate a perfect hash function given the set of keys +#include "UniversalHashGenerator.hpp" +#include "FractionBitElemArray.hpp" +#include "Bitvector_Plain.hpp" +#include "SimpleVector.hpp" + +#define PERFECT_MAP_KEY_TRIES 3 + +namespace compactds { +class PerfectHash +{ +private: + UniversalHashGenerator uh ; + uint64_t a[PERFECT_MAP_KEY_TRIES], b[PERFECT_MAP_KEY_TRIES] ; // the parameters from the universal hash function + FractionBitElemArray G ; + size_t m ; + + // Map with hash, this include the shift + uint64_t MapWithHashI(uint64_t key, int i) + { + return uh.Map(a[i], b[i], key) + i * (m/PERFECT_MAP_KEY_TRIES) ; + } + + // The method is to give each key three potential slots, + // the goal is to find a map that 
each slot is assigned by a unique key (one of the three). + // So we process all the keys first, and start from the slot with unique key already + // and release the assignment from the other two slots of the key. + // This may release more slots with unique keys, and we repeat this process + // If there are still ambiguous keys, we return FAIL(0) + // + // I think my implementation is better than the one suggested in the textbook, + // as it does not need to store the tuple and nodes/link map and also use queue instead + // of priority_queue which also gives linear time speed + int InitTry(uint64_t *keys, size_t n, SimpleVector *L, size_t *nL, + size_t *uniqueSlotQueue, WORD *keyIdxProcessed, size_t *S) + { + size_t i ; + int j ; // in this function, j is to iterate hash function tries + size_t Scnt = 0 ; + size_t uniqueSlotQueueS, uniqueSlotQueueE ; + + // Initialize some parametrs. + for (i = 0 ; i < m ; ++i) + L[i].Clear() ; + memset(keyIdxProcessed, 0, Utils::BitsToWordBytes(m)) ; + uniqueSlotQueueS = 0 ; uniqueSlotQueueE = 0 ; //[S..E) + + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + uh.Generate(a[j], b[j]) ; + + // Put all the keys to their slots + for (i = 0 ; i < n ; ++i) + { + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + { + uint64_t target = MapWithHashI(keys[i], j); + //printf("%llu %llu %d: %d %d: %d\n", a[j], b[j], m, i, j, target) ; + L[target].PushBack(i) ; + } + } + + // Initialize the unique slot queue + for (i = 0 ; i < m ; ++i) + { + if (L[i].Size() == 1) + { + uniqueSlotQueue[uniqueSlotQueueE] = i ; + ++uniqueSlotQueueE ; + } + nL[i] = L[i].Size() ; + } + + // main part, identify which slot is unique for a key until now + while (uniqueSlotQueueS < uniqueSlotQueueE) + { + size_t slot = uniqueSlotQueue[uniqueSlotQueueS] ; + ++uniqueSlotQueueS ; + // Since each slot will be removed once + // and the total length of the list PERFECT_MAP_KEY_TRIES*n + // , the overall time is still O(n) + size_t size = L[slot].Size() ; + size_t keyIdx = -1; + 
for (i = 0 ; i < size ; ++i) + { + if (!Utils::BitRead(keyIdxProcessed, L[slot][i])) + { + keyIdx = L[slot][i] ; + break ; + } + } + if (i >= size) + { + // The l becomes empty, this could happen when a key + // creates more than one unique-mapped slots + continue ; + } + Utils::BitSet(keyIdxProcessed, keyIdx) ; + S[Scnt] = keys[keyIdx] ; + ++Scnt ; + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + { + uint64_t target = MapWithHashI(keys[keyIdx], j) ; + --nL[target] ; + if (nL[target] == 1) // it could be 0, so we should not use <=1 + { + uniqueSlotQueue[uniqueSlotQueueE] = target ; + ++uniqueSlotQueueE ; + } + } + } + if (Scnt < n) + return 0 ; + G.Malloc(3, m) ; // The value of G is {0, 1, 2} + WORD *V = Utils::MallocByBits(m) ; + for (i = 1 ; i <= Scnt ; ++i) + { + size_t key = keys[S[Scnt - i]] ; + uint64_t targets[PERFECT_MAP_KEY_TRIES] ; + int gSumMod = 0 ; + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + { + targets[j] = MapWithHashI(key, j) ; + gSumMod += G.Read(targets[j]) % PERFECT_MAP_KEY_TRIES ; + } + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + { + if (!Utils::BitRead(V, targets[j])) + { + int tmp = (j - gSumMod) % PERFECT_MAP_KEY_TRIES ; + if (tmp < 0) + tmp += PERFECT_MAP_KEY_TRIES ; + G.Write(targets[j], tmp) ; + break ; + } + } + + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + Utils::BitSet(V, targets[j]) ; + } + free(V) ; + return 1 ; + } +public: + PerfectHash() {} + ~PerfectHash() {} + + size_t GetSpace() + { + return G.GetSpace() - sizeof(G) + sizeof(*this) ; + } + + void Init(uint64_t *keys, size_t n, size_t m) + { + if (m == 0) + m = CEIL(1.25 * n / PERFECT_MAP_KEY_TRIES) * PERFECT_MAP_KEY_TRIES; + this->m = m ; + SimpleVector *L ; // the key list associated with each slot + size_t *nL ; // number of element in each L + size_t *uniqueSlotQueue ; // the queue for slot with unique keys + size_t *S ; // the stack used to store keys + WORD *keyIdxProcessed ; // bit vector represent whether a key has been processed + + L = new 
SimpleVector[m] ; + nL = (size_t *)malloc(sizeof(size_t) * m) ; + uniqueSlotQueue = (size_t *)malloc(sizeof(size_t) * m) ; + S = (size_t *)malloc(sizeof(size_t) * n) ; + keyIdxProcessed = Utils::MallocByBits(n) ; + + uh.Init(m/PERFECT_MAP_KEY_TRIES, 0) ; + + while (!InitTry(keys, n, L, nL, uniqueSlotQueue, keyIdxProcessed, S)) + ; + + delete[] L ; + free(nL) ; + free(uniqueSlotQueue) ; + free(S) ; + free(keyIdxProcessed) ; + } + + uint64_t Map(uint64_t x) + { + size_t i ; + uint64_t hs[PERFECT_MAP_KEY_TRIES] ; + int gsum = 0 ; + for (i = 0 ; i < PERFECT_MAP_KEY_TRIES ; ++i) + { + hs[i] = MapWithHashI(x, i) ; + gsum += G.Read(hs[i]) ; + } + return hs[gsum %PERFECT_MAP_KEY_TRIES] ; + } +} ; +} + +#endif diff --git a/compactds/Permutation.hpp b/compactds/Permutation.hpp new file mode 100644 index 0000000..a1fac23 --- /dev/null +++ b/compactds/Permutation.hpp @@ -0,0 +1,237 @@ +#ifndef _MOURISL_COMPACTDS_PERMUTATION +#define _MOURISL_COMPACTDS_PERMUTATION + +// Compressed permutation representation. Chapter 5.3 +// So far it assuems at most 2^31 runs +#include "Utils.hpp" +#include "HuffmanCode.hpp" +#include "Bitvector_Plain.hpp" + +namespace compactds { +class Permutation +{ +private: + size_t _space ; + size_t _n ; + size_t _rcnt ; + Bitvector_Plain *_nodeB ; // The left, right child indicator + Bitvector_Plain _G ; // Mark the start position for each run in the permuation representation. 
+ int *_nodePath ; //Buffer to holde the node ids along a path from root to leaf + HuffmanCode _huffmanTree ; + + // Combine the CreateLeaves and CreateBitvectors of the book into the same function + // Using S to hold the sequences mimicing merge sort + // tag: tree id + void CreateBitvectors(const struct _huffman_node *tree, int tag, size_t *S, size_t offset, size_t *Pi) + { + size_t i ; + if (tree[tag].left == -1) + { + // Leaf + for (i = 0 ; i < tree[tag].freq ; ++i) + S[offset + i] = Pi[ _G.Select(1, tree[tag].symbol + 1) + i] ; + } + else + { + CreateBitvectors(tree, tree[tag].left, S, offset, Pi) ; + CreateBitvectors(tree, tree[tag].right, S, offset + tree[tree[tag].left].freq, Pi) ; + + _nodeB[tag].Malloc(tree[tag].freq) ; + _space += (_nodeB[tag].GetSpace() - sizeof(_nodeB[tag])) ; + + // Merge + size_t *buffer = (size_t *)malloc(sizeof(size_t) * tree[tag].freq) ; + size_t lp = offset, rp = offset + tree[tree[tag].left].freq ; // left/right pointer + size_t lcnt, rcnt ; // scanned count of left and right children + lcnt = rcnt = 0 ; + while (lcnt < tree[tree[tag].left].freq && rcnt < tree[ tree[tag].right ].freq) + { + if (S[lp] < S[rp]) + { + buffer[lcnt + rcnt] = S[lp] ; + ++lcnt ; ++lp ; + } + else if (S[lp] > S[rp]) + { + buffer[lcnt + rcnt] = S[rp] ; + _nodeB[tag].BitSet(lcnt + rcnt) ; + ++rcnt ; ++rp ; + } + else + { + // ERROR! 
+ } + } + while (lcnt < tree[tree[tag].left].freq) + { + buffer[lcnt + rcnt] = S[lp] ; + ++lcnt ; ++lp ; + } + while (rcnt < tree[tree[tag].right].freq) + { + buffer[lcnt + rcnt] = S[rp] ; + _nodeB[tag].BitSet(lcnt + rcnt) ; + ++rcnt ; ++rp ; + } + _nodeB[tag].Init() ; + for (i = offset ; i < offset + tree[tag].freq ; ++i) + S[i] = buffer[i - offset] ; + free(buffer) ; + } + } +public: + Permutation() + { + _space = 0 ; + _n = 0 ; + _rcnt = 0 ; + } + + ~Permutation() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + delete[] _nodeB ; + free(_nodePath) ; + _n = 0 ; + } + } + + size_t GetSpace() + { + return _space + sizeof(*this) ; + } + + void Init(size_t *Pi, size_t n) + { + _n = n ; + size_t i, j ; + std::vector rstarts ; + std::vector rlens ; + _G.Malloc(n) ; + for (i = 0 ; i < n ;) + { + for (j = i + 1 ; j < n ; ++j) + if (Pi[j] < Pi[j - 1]) + break ; + rstarts.push_back(i) ; + rlens.push_back(j - i) ; + _G.BitSet(i) ; + i = j ; + } + _rcnt = rstarts.size() ; + _G.Init() ; + _space += _G.GetSpace() - sizeof(_G) ; + + _huffmanTree.InitFromFrequency(rlens.data(), _rcnt) ; + _space += _huffmanTree.GetSpace() - sizeof(_huffmanTree) ; + + int depth = _huffmanTree.GetDepth( _huffmanTree.GetRoot() ) ; + _nodePath = (int *)malloc(sizeof(_nodePath[0]) * (depth + 1)) ; + _space += sizeof(_nodePath[0]) * (depth + 1) ; + + _nodeB = new Bitvector_Plain[2 * _rcnt - 1] ; + _space += sizeof(Bitvector_Plain) * (2 * _rcnt - 1) ; + size_t *S = (size_t *)malloc(sizeof(size_t) * n) ; + CreateBitvectors(_huffmanTree.GetTree(), _huffmanTree.GetRoot(), S, 0, Pi) ; + /*for (i = 0 ; i < _rcnt ; ++i) + { + int l ; + WORD code = _huffmanTree.Encode(i, l) ; + printf("%d %d %d: %d %d\n", i, rstarts[i], rlens[i], code, l) ; + }*/ + free(S) ; + } + + // Pi(i) + // read() in the book + size_t Next(size_t i) const + { + int j ; + const struct _huffman_node *tree = _huffmanTree.GetTree() ; + int len ; + size_t ri = _G.Rank(1, i) - 1 ; + WORD code = _huffmanTree.Encode(ri, len) ; + + 
_nodePath[0] = _huffmanTree.GetRoot() ; + for (j = 0 ; j < len ; ++j) + { + if ((code >> (len - j - 1)) & 1) + _nodePath[j + 1] = tree[ _nodePath[j] ].right ; + else + _nodePath[j + 1] = tree[ _nodePath[j] ].left ; + } + + i = i - _G.Select(1, ri + 1) ; + for (j = len - 1 ; j >= 0 ; --j) + { + if ((code >> (len - 1 - j)) & 1) + { + i = _nodeB[_nodePath[j]].Select(1, i + 1) ; + } + else + { + i = _nodeB[_nodePath[j]].Select(0, i + 1) ; + } + } + return i ; + } + + // Pi^-1(i) + // inverse() in the book + size_t Prev(size_t i) const + { + size_t j = i ; // tracking the position of i in a run + const struct _huffman_node *tree = _huffmanTree.GetTree() ; + size_t tag = _huffmanTree.GetRoot() ; + while (tree[tag].left != -1) + { + int b = _nodeB[tag].Access(j) ; + j = _nodeB[tag].Rank(b, j, /*inclusive=*/0) ; + if (b == 0) + tag = tree[tag].left ; + else + tag = tree[tag].right ; + } + + return _G.Select(1, tree[tag].symbol + 1) + j ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _space) ; + SAVE_VAR(fp, _rcnt) ; + _G.Save(fp) ; + _huffmanTree.Save(fp) ; + size_t i ; + for (i = 0 ; i < 2 * _rcnt - 1 ; ++i) + _nodeB[i].Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _space) ; + LOAD_VAR(fp, _rcnt) ; + _G.Load(fp) ; + _huffmanTree.Load(fp) ; + size_t i ; + _nodeB = new Bitvector_Plain[2 * _rcnt - 1] ; + for (i = 0 ; i < 2 * _rcnt - 1 ; ++i) + _nodeB[i].Load(fp) ; + + int depth = _huffmanTree.GetDepth( _huffmanTree.GetRoot() ) ; + _nodePath = (int *)malloc(sizeof(_nodePath[0]) * (depth + 1)) ; + } +} ; +} + +#endif diff --git a/compactds/Sequence.hpp b/compactds/Sequence.hpp new file mode 100644 index 0000000..359efe9 --- /dev/null +++ b/compactds/Sequence.hpp @@ -0,0 +1,48 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE +#define _MOURISL_COMPACTDS_SEQUENCE + +#include "Utils.hpp" +#include "Alphabet.hpp" +#include "FixedSizeElemArray.hpp" + +namespace compactds { +class Sequence +{ +protected: + size_t _space ; + 
Alphabet _alphabets ; + size_t _n ; // sequence length +public: + Sequence() {_space = 0 ; _n = 0 ;} + ~Sequence() {} + + void SetAlphabet(const Alphabet &a) + { + _alphabets = a ; + } + + virtual void Save(FILE *fp) + { + SAVE_VAR(fp, _space) ; + SAVE_VAR(fp, _n) ; + _alphabets.Save(fp) ; + } + + virtual void Load(FILE *fp) + { + LOAD_VAR(fp, _space) ; + LOAD_VAR(fp, _n) ; + _alphabets.Load(fp) ; + } + + virtual void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) = 0 ; + virtual void Free() = 0 ; + virtual size_t GetSpace() = 0 ; + virtual ALPHABET Access(size_t i) const = 0 ; + virtual size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const = 0 ; + virtual size_t Select(ALPHABET c, size_t i) const = 0 ; + virtual void PrintStats() = 0 ; +} ; +} + +#endif diff --git a/compactds/SequenceCompactor.hpp b/compactds/SequenceCompactor.hpp new file mode 100644 index 0000000..f7b0f70 --- /dev/null +++ b/compactds/SequenceCompactor.hpp @@ -0,0 +1,76 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCECOMPACTOR +#define _MOURISL_COMPACTDS_SEQUENCECOMPACTOR + +// The class that handles convert the raw sequence to FixedSizeElemArray +// I put this class in compactds because FM and Sequence classes assumes +// the input is from compact representation +#include "FixedSizeElemArray.hpp" +#include "Alphabet.hpp" + +namespace compactds { +class SequenceCompactor +{ +private: + bool _capitalize ; + ALPHABET _missingReplace ; + Alphabet _alphabets ; +public: + SequenceCompactor() + { + _capitalize = false ; + _missingReplace = '\0' ; + }; + + ~SequenceCompactor() {} ; + + void Init(const char *alphabetList) + { + _alphabets.InitFromList(alphabetList, strlen(alphabetList)) ; + } + + void Init(const char *alphabetList, FixedSizeElemArray &compactSeq, size_t reserveLength) + { + int alphabetCodeLen = _alphabets.InitFromList(alphabetList, strlen(alphabetList)) ; + compactSeq.Malloc(alphabetCodeLen, reserveLength) ; + compactSeq.SetSize(0) ; + } + + void 
SetCapitalize(bool c) + { + _capitalize = c ; + } + + void SetMissingReplace(ALPHABET c) + { + _missingReplace = c ; + } + + // @return: number of chars added to seq + size_t Compact(const char *rawseq, FixedSizeElemArray &seq) + { + size_t i ; + size_t origLen = seq.GetSize() ; + for (i = 0 ; rawseq[i] ; ++i) + { + char c = rawseq[i] ; + if (_capitalize) + { + if (c >= 'a' && c <= 'z') + c = c - 'a' + 'A' ; + } + + if (!_alphabets.IsIn(c)) + { + if (_missingReplace == '\0') + continue ; + else + c = _missingReplace ; + } + seq.PushBack( _alphabets.Encode(c) ) ; + } + + return seq.GetSize() - origLen ; + } +} ; +} +#endif diff --git a/compactds/Sequence_Hybrid.hpp b/compactds/Sequence_Hybrid.hpp new file mode 100644 index 0000000..5bfdbd7 --- /dev/null +++ b/compactds/Sequence_Hybrid.hpp @@ -0,0 +1,328 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE_HYBRID +#define _MOURISL_COMPACTDS_SEQUENCE_HYBRID + +#include "Sequence.hpp" +#include "Sequence_WaveletTree.hpp" +#include "Sequence_RunLength.hpp" + +namespace compactds { +class Sequence_Hybrid: public Sequence +{ +private: + size_t _b ; // block size + size_t _blockCnt ; + size_t _minAvgRunLength ; // minimum average run length in a block + Bitvector_Plain _useRunLength ; // 0-plain sequence, 1-run length sequence + //size_t **_alphabetBlockPartialSum ; + Sequence_WaveletTree _waveletSeq ; + Sequence_RunLength _runlengthSeq ; + + size_t _blockSizeInferLength ; // use this amount of numbers to infer block size + + size_t EstimateSpace(const FixedSizeElemArray &S, size_t n, size_t b, size_t minRl, int alphabetBit) + { + size_t i, j ; + size_t rlBlockCnt = 0 ; // the number of blocks for run-length representation + size_t rlBlockLen = 0 ; + size_t runCnt = 0 ; // run count in runlength-endcoed sequence. 
+ size_t lastRunChr = 0 ; + for (i = 0 ; i < n ; i += b) + { + uint64_t c = S.Read(i) ; + size_t localRunCnt = 1 ; + for (j = i + 1 ; j < i + b && j < n ; ++j) + { + if (S.Read(j) != c) + { + ++localRunCnt ; + c = S.Read(j) ; + } + } + if ((j - i) / localRunCnt >= minRl) + { + size_t reduce = 0 ; + if (S.Read(i) == lastRunChr) + reduce = 1 ; + runCnt += localRunCnt - reduce ; + rlBlockLen += (j - i) ; + lastRunChr = c ; + ++rlBlockCnt ; + } + } + size_t ret = DIV_CEIL(n, b) + alphabetBit * (n - rlBlockLen) ; + + if (runCnt > 0) + ret += runCnt * Utils::Log2Ceil(n / runCnt) + alphabetBit * runCnt + runCnt * Utils::Log2Ceil(n * 4 / runCnt) ; + + return ret ; + } + + // Use the first m characters from S to determine the best block size + // the blocksize shall minimize the block bit overhead and + // maximize the number of characters that are in the rl-block + size_t ComputeBlockSize(const FixedSizeElemArray &S, size_t n, size_t alphabetSize) + { + size_t i ; + int alphabetBit = Utils::Log2Ceil(alphabetSize) ; + + size_t bestSpace = 0 ; + size_t bestTag = 0 ; + size_t m = (n < _blockSizeInferLength ? 
n : _blockSizeInferLength) ; + for (i = 4 ; i <= m ; i *= 2) + { + size_t space = EstimateSpace(S, m, i, _minAvgRunLength, alphabetBit) ; + if (bestSpace == 0 || space < bestSpace) + { + bestSpace = space ; + bestTag = i ; + } + } + + if (bestTag <= m) + { + size_t space = EstimateSpace(S, m, bestTag / 2 * 3, _minAvgRunLength, alphabetBit) ; + if (space < bestSpace) + { + bestSpace = space ; + bestTag = bestTag / 2 * 3 ; + } + } + return bestTag ; + } + +public: + Sequence_Hybrid() + { + _b = 0 ; + _minAvgRunLength = 6 ; + _blockSizeInferLength = (1<<20) ; + } + + ~Sequence_Hybrid() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + //size_t i ; + //size_t alphabetSize = _alphabets.GetSize() ; + //for (i = 0 ; i < alphabetSize ; ++i) + // free(_alphabetBlockPartialSum[i]) ; + //free(_alphabetBlockPartialSum) ; + _n = 0 ; + } + } + + void SetBlockSize(size_t b) + { + _b = b ; + } + + void SetblockSizeInferLength(size_t l) + { + _blockSizeInferLength = l ; + } + + void SetMinAvgRunLength(size_t r) + { + _minAvgRunLength = r ; + } + + size_t GetSpace() + { + return _space + sizeof(*this) + _alphabets.GetSpace() - sizeof(_alphabets); + } + + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + size_t i, j ; + + _n = sequenceLength ; + size_t alphabetSize = _alphabets.GetSize() ; + if (alphabetSize == 0) + { + _alphabets.InitFromList(alphabetMap, strlen(alphabetMap)) ; + alphabetSize = _alphabets.GetSize() ; + } + //size_t *psums ; // use this to avoid access the rank in another type of array + //psums = (size_t *)calloc(alphabetSize, sizeof(size_t)) ; + + if (_b == 0) + _b = ComputeBlockSize(S, sequenceLength, alphabetSize) ; + + _blockCnt = DIV_CEIL(_n, _b) ; + size_t runlengthBlockCnt = 0 ; + + WORD *B = Utils::MallocByBits(_blockCnt) ; // block indicator + //_alphabetBlockPartialSum = (size_t **)malloc(sizeof(size_t *) * alphabetSize) ; + //_space += sizeof(size_t *) * alphabetSize ; + /*for (i = 0 ; i < 
alphabetSize ; ++i) + { + //_alphabetBlockPartialSum[i] = (size_t *)malloc(sizeof(size_t) * (_blockCnt + 1)) ; + //_space += sizeof(size_t) * (_blockCnt + 1) ; + }*/ + + for (i = 0 ; i < _n ; i += _b) + { + //for (j = 0 ; j < alphabetSize ; ++j) + // _alphabetBlockPartialSum[j][i / _b] = psums[j] ; + + int prevc = S.Read(i) ; + //++psums[prevc] ; + size_t rcnt = 1 ; + for (j = 1 ; j < _b && i + j < _n ; ++j) + { + int c = S.Read(i + j) ; + //++psums[c] ; + if (c != prevc) + { + ++rcnt ; + prevc = c ; + } + } + if (_b / rcnt >= _minAvgRunLength) + { + ++runlengthBlockCnt ; + Utils::BitSet(B, i / _b) ; + } + } + //for (j = 0 ; j < alphabetSize ; ++j) + // _alphabetBlockPartialSum[j][i / _b] = psums[j] ; + _useRunLength.Init(B, _blockCnt) ; + + // Split the sequence into two parts + FixedSizeElemArray tmpS ; + tmpS.Malloc(S.GetElemLength(), _n) ; + int k ; // use run length + for ( k = 0 ; k <= 1 ; ++k) + { + size_t size = 0 ; + for (i = 0 ; i < _n ; i += _b) + { + if (Utils::BitRead(B, i / _b) != k) + continue ; + for (j = 0 ; j < _b && i + j < _n ; ++j) + { + tmpS.Write(size, S.Read(i + j)) ; + ++size ; + } + } + + tmpS.SetSize(size) ; + //printf("%d %d\n", _b, size) ; + if (k == 0) + { + if (size > 0) + { + _waveletSeq.SetSelectSpeed( DS_SELECT_SPEED_NO ) ; + _waveletSeq.Init(tmpS, size, alphabetMap) ; + } + } + else + { + if (size > 0) + _runlengthSeq.Init(tmpS, size, alphabetMap) ; + } + } + _space += _useRunLength.GetSpace() - sizeof(_useRunLength) ; + _space += _waveletSeq.GetSpace() - sizeof(_waveletSeq) ; + _space += _runlengthSeq.GetSpace() - sizeof(_runlengthSeq) ; + //printf("%d %d %d\n", sizeof(*this), sizeof(_waveletSeq), sizeof(_runlengthSeq)) ; + + //free(psums) ; + free(B) ; + } + + ALPHABET Access(size_t i) const + { + size_t bi = i / _b ; + int type = _useRunLength.Access(bi) ; + if (type == 0) + { + size_t r = _useRunLength.Rank(1, bi) ; + i -= _b * r ; + return _waveletSeq.Access(i) ; + } + else + { + size_t r = _useRunLength.Rank(0, bi) ; + i -= 
_b * r ; + return _runlengthSeq.Access(i) ; + } + } + + size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const + { + if (!inclusive) + { + if (i == 0) + return 0 ; + --i ; + } + + size_t bi = i / _b ; + int type = _useRunLength.Access(bi) ; + size_t ranki = _useRunLength.Rank(type, bi) ; + size_t otherRanki = (bi + 1) - ranki ; + + size_t ret = 0 ; + size_t typei = (ranki - 1) * _b + i % _b ; // ranki>=1 because bi is of type. + if (type == 0) + ret = _waveletSeq.Rank(c, typei) ; + else + ret = _runlengthSeq.Rank(c, typei) ; + if (otherRanki == 0) + return ret ; + + size_t otheri = otherRanki * _b - 1 ; + if (type == 0) + ret += _runlengthSeq.Rank(c, otheri) ; + else + ret += _waveletSeq.Rank(c, otheri) ; + + return ret ; + } + + size_t Select(ALPHABET c, size_t i) const + { + return 0 ; + } + + void Save(FILE *fp) + { + Sequence::Save(fp) ; + SAVE_VAR(fp, _b) ; + SAVE_VAR(fp, _blockCnt) ; + SAVE_VAR(fp, _minAvgRunLength) ; + _useRunLength.Save(fp) ; + _waveletSeq.Save(fp) ; + _runlengthSeq.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + Sequence::Load(fp) ; + LOAD_VAR(fp, _b) ; + LOAD_VAR(fp, _blockCnt) ; + LOAD_VAR(fp, _minAvgRunLength) ; + _useRunLength.Load(fp) ; + _waveletSeq.Load(fp) ; + _runlengthSeq.Load(fp) ; + } + + void PrintStats() + { + Utils::PrintLog("Sequence_Hybrid: total_length: %lu block_size: %lu min_avg_runlength: %lu runlength_block: %lu", + _n, _b, _minAvgRunLength, _useRunLength.Rank(1, _blockCnt - 1)) ; + _runlengthSeq.PrintStats() ; + _waveletSeq.PrintStats() ; + } +} ; +} + +#endif diff --git a/compactds/Sequence_Permutation.hpp b/compactds/Sequence_Permutation.hpp new file mode 100644 index 0000000..03137be --- /dev/null +++ b/compactds/Sequence_Permutation.hpp @@ -0,0 +1,70 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE_PERMUTATION +#define _MOURISL_COMPACTDS_SEQUENCE_PERMUTATION + +#include "Utils.hpp" +#include "Alphabet.hpp" +#include "FixedSizeElemArray.hpp" +#include "Sequence.hpp" + +namespace compactds { +class 
Sequence_Permutation: public Sequence +{ +private: +public: + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + } + + void Free() + { + } + + size_t GetSpace() + { + } + + ALPHABET Access(size_t i) const + { + return AccessLong(i) ; + } + + size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const + { + return RankLong(c, i, inclusive) ; + } + + size_t Select(ALPHABET c, size_t i) const + { + return SelectLong(c, i) ; + } + + size_t AccessLong(size_t i) const + { + } + + size_t RankLong(size_t c, size_t i, int inclusive = 1) const + { + } + + size_t SelectLong(size_t c, size_t i) const + { + } + + + void Save(FILE *fp) + { + Sequence::Save(fp) ; + } + + void Load(FILE *fp) + { + Sequence::Load(fp) ; + } + + void PrintStats() + { + } +} ; +} + +#endif diff --git a/compactds/Sequence_Plain.hpp b/compactds/Sequence_Plain.hpp new file mode 100644 index 0000000..a93e568 --- /dev/null +++ b/compactds/Sequence_Plain.hpp @@ -0,0 +1,101 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE_PLAIN +#define _MOURISL_COMPACTDS_SEQUENCE_PLAIN + +#include "Utils.hpp" +#include "Alphabet.hpp" +#include "Sequence.hpp" + +#include "Bitvector_Plain.hpp" +#include "Bitvector_RunLength.hpp" + +// The sequence representation where each alphabet is a bitvector +namespace compactds { +template +class Sequence_Plain: public Sequence +{ +private: + BvClass *_Bvs ; // bitvectors + int _selectSpeed ; +public: + Sequence_Plain() + { + _selectSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + _space = 0; + } + + ~Sequence_Plain() + { + Free() ; + } + + void Free() + { + delete[] _Bvs ; + } + + size_t GetSpace() + { + return _space + _alphabets.GetSpace() - sizeof(_alphabets) + sizeof(*this) ; + } + + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + size_t i, j, k ; + _space = 0 ; + + if (_alphabets.GetSize() == 0) + _alphabets.InitFromList(alphabetMap, strlen(alphabetMap)) ; + + this->_n = sequenceLength ; + size_t 
alphabetSize = _alphabets.GetSize() ; + + _Bvs = new BvClass[alphabetSize] ; + WORD *B = Utils::MallocByBits(_n) ; + for (i = 0 ; i < alphabetSize ; ++i) + { + for (j = 0 ; j < _n ; j += WORDBITS) + { + WORD w = 0 ; + for (k = 0 ; k < WORDBITS && j + k < _n ; ++k) + { + if (S.Read(j + k) == (int)i) + w |= (1ull< + +#include "Sequence.hpp" +#include "Sequence_WaveletTree.hpp" + +// Split the original sequence into fixed-length blocks, +// compress the single-run block by reducing it to one character +namespace compactds { +class Sequence_RunBlock: public Sequence +{ +private: + size_t _b ; // block size + size_t _blockCnt ; + Bitvector_Plain _useRunBlock ; // 0-plain sequence, 1-homo polymer sequence + //size_t **_alphabetBlockPartialSum ; + Sequence_WaveletTree _waveletSeq ; + Sequence_WaveletTree _runBlockSeq ; + + // Variables and functions related to automatic block size estimation + size_t _blockSizeInferLength ; // use this amount of numbers to infer block size + size_t EstimateSpace(const FixedSizeElemArray &S, size_t n, size_t b, int alphabetBit) + { + size_t i, j ; + size_t runBlockCnt = 0 ; + size_t runBlockLen = 0 ; + for (i = 0 ; i < n ; i += b) + { + uint64_t c = S.Read(i) ; + bool runBlockFlag = true ; + for (j = i + 1 ; j < i + b && j < n ; ++j) + { + if (S.Read(j) != c) + { + runBlockFlag = false ; + break ; + } + } + if (runBlockFlag) + { + ++runBlockCnt ; + runBlockLen += (j - i) ; + } + } + return DIV_CEIL(n, b) + alphabetBit * (runBlockCnt + n - runBlockLen) ; + } + + // Use the first m characters from S to determine block size + size_t ComputeBlockSize(const FixedSizeElemArray &S, size_t n, size_t alphabetSize) + { + size_t i ; + int alphabetBit = Utils::Log2Ceil(alphabetSize) ; + + size_t bestSpace = 0 ; + size_t bestTag = 0 ; + size_t m = (n < _blockSizeInferLength ? 
n : _blockSizeInferLength) ; + for (i = 4 ; i <= m ; i *= 2) + { + size_t space = EstimateSpace(S, m, i, alphabetBit) ; + if (bestSpace == 0 || space < bestSpace) + { + bestSpace = space ; + bestTag = i ; + } + } + + if (bestTag <= m) + { + size_t space = EstimateSpace(S, m, bestTag / 2 * 3, alphabetBit) ; + if (space < bestSpace) + { + bestSpace = space ; + bestTag = bestTag / 2 * 3 ; + } + + size_t r = 0 ; + size_t c = S.Read(0) ; + for (i = 1 ; i < m ; ++i) + { + size_t tmp = S.Read(i) ; + if (tmp != c) + { + ++r ; + c = tmp ; + } + } + size_t testSize = CEIL(sqrt((double)m/(double)r)) ; + if (testSize > 2) + { + space = EstimateSpace(S, m, testSize, alphabetBit) ; + if (space < bestSpace) + { + bestSpace = space ; + bestTag = testSize ; + } + } + } + return bestTag ; + } + +public: + Sequence_RunBlock() + { + _b = 0 ; + _blockSizeInferLength = (1<<20) ; + } + + ~Sequence_RunBlock() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + //size_t i ; + //size_t alphabetSize = _alphabets.GetSize() ; + //for (i = 0 ; i < alphabetSize ; ++i) + // free(_alphabetBlockPartialSum[i]) ; + //free(_alphabetBlockPartialSum) ; + _n = 0 ; + } + } + + void SetBlockSize(size_t b) + { + _b = b ; + } + + void SetBlockSizeInferLength(size_t l) + { + _blockSizeInferLength = l ; + } + + size_t GetSpace() + { + bool inclusive = true ; + return _space + _alphabets.GetSpace() - sizeof(_alphabets) + + (inclusive ? 
sizeof(*this) : 0) ; + } + + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + size_t i, j ; + + _n = sequenceLength ; + size_t alphabetSize = _alphabets.GetSize() ; + if (alphabetSize == 0) + { + _alphabets.InitFromList(alphabetMap, strlen(alphabetMap)) ; + alphabetSize = _alphabets.GetSize() ; + } + + if (_b == 0) + _b = ComputeBlockSize(S, sequenceLength, alphabetSize) ; + + _blockCnt = DIV_CEIL(_n, _b) ; + + WORD *B = Utils::MallocByBits(_blockCnt) ; // block indicator + + for (i = 0 ; i < _n ; i += _b) + { + int prevc = S.Read(i) ; + size_t rcnt = 1 ; + for (j = 1 ; j < _b && i + j < _n ; ++j) + { + int c = S.Read(i + j) ; + if (c != prevc) + { + ++rcnt ; + prevc = c ; + break ; + } + } + if (rcnt == 1) + { + Utils::BitSet(B, i / _b) ; + } + } + _useRunBlock.SetSelectSpeed(DS_SELECT_SPEED_NO) ; + _useRunBlock.Init(B, _blockCnt) ; + + // Split the sequence into two parts + FixedSizeElemArray tmpS ; + tmpS.Malloc(S.GetElemLength(), _n) ; + int k ; // use run lbock + for ( k = 0 ; k <= 1 ; ++k) + { + size_t size = 0 ; + for (i = 0 ; i < _n ; i += _b) + { + if (Utils::BitRead(B, i / _b) != k) + continue ; + if (k == 0) + { + for (j = 0 ; j < _b && i + j < _n ; ++j) + { + tmpS.Write(size, S.Read(i + j)) ; + ++size ; + } + } + else + { + tmpS.Write(size, S.Read(i)) ; + ++size ; + } + } + + tmpS.SetSize(size) ; + //printf("%d %d\n", _b, size) ; + if (k == 0) + { + if (size > 0) + { + _waveletSeq.SetSelectSpeed( DS_SELECT_SPEED_NO ) ; + _waveletSeq.Init(tmpS, size, alphabetMap) ; + } + } + else + { + if (size > 0) + { + _runBlockSeq.SetSelectSpeed( DS_SELECT_SPEED_NO ) ; + _runBlockSeq.Init(tmpS, size, alphabetMap) ; + } + } + } + _space += _useRunBlock.GetSpace() - sizeof(_useRunBlock) ; + _space += _waveletSeq.GetSpace() - sizeof(_waveletSeq) ; + _space += _runBlockSeq.GetSpace() - sizeof(_runBlockSeq) ; + //printf("%d %d %d\n", sizeof(*this), sizeof(_waveletSeq), sizeof(_runBlockSeq)) ; + + free(B) ; + } + + ALPHABET 
Access(size_t i) const + { + size_t bi = i / _b ; + int type = _useRunBlock.Access(bi) ; + if (type == 0) + { + size_t r = _useRunBlock.Rank(1, bi) ; + i -= _b * r ; + return _waveletSeq.Access(i) ; + } + else + { + size_t r = _useRunBlock.Rank(0, bi) ; + i -= _b * r ; + return _runBlockSeq.Access(i/_b) ; + } + } + + size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const + { + if (!inclusive) + { + if (i == 0) + return 0 ; + --i ; + } + + size_t bi = i / _b ; + int type = _useRunBlock.Access(bi) ; + size_t ranki = _useRunBlock.Rank(type, bi) ; + size_t otherRanki = (bi + 1) - ranki ; + + size_t ret = 0 ; + if (type == 0) + ret = _waveletSeq.Rank(c, (ranki - 1) * _b + i % _b) ; // ranki>=1 because bi is of type. + else + { + bool inRun = true ; + size_t rbRank = _runBlockSeq.RankAndTest(c, ranki - 1, inRun) ; // type==1 makes sure ranki >= 1 + if (inRun) // This makes sure rbRank>=1 at (ranki-1) + ret = (rbRank - 1) * _b + i % _b + 1; + else + ret = rbRank * _b ; + } + + if (otherRanki == 0) + { + return ret ; + } + if (type == 0) + ret += _runBlockSeq.Rank(c, otherRanki - 1) * _b ; + else + ret += _waveletSeq.Rank(c, otherRanki * _b - 1) ; + + return ret ; + } + + size_t Select(ALPHABET c, size_t i) const + { + return 0 ; + } + + void Decompress(FixedSizeElemArray &S) + { + S.Free() ; + + size_t i, j ; + size_t rbIdx = 0 ; + size_t alphabetBit = Utils::Log2Ceil(_alphabets.GetSize()) ; + S.Malloc(alphabetBit, _n) ; + for (i = 0 ; i < _n ; i += _b) + { + size_t k = i / _b ; + if (_useRunBlock.Access(k) == 1) + { + size_t c = _runBlockSeq.Access(rbIdx) ; + for (j = i ; j < i + _b && j < _n ; ++j) + S.Write(j, c) ; + ++rbIdx ; + } + else + { + size_t l = i - _b * rbIdx ; + for (j = i ; j < i + _b && j < _n ; ++j, ++l) + { + size_t c = _waveletSeq.Access(l) ; + S.Write(j, c) ; + } + } + } + } + + void Save(FILE *fp) + { + Sequence::Save(fp) ; + SAVE_VAR(fp, _b) ; + SAVE_VAR(fp, _blockCnt) ; + _useRunBlock.Save(fp) ; + _waveletSeq.Save(fp) ; + _runBlockSeq.Save(fp) ; 
+ } + + void Load(FILE *fp) + { + Free() ; + + Sequence::Load(fp) ; + LOAD_VAR(fp, _b) ; + LOAD_VAR(fp, _blockCnt) ; + _useRunBlock.Load(fp) ; + _waveletSeq.Load(fp) ; + _runBlockSeq.Load(fp) ; + } + + void PrintStats() + { + Utils::PrintLog("Sequence_RunBlock: total_length: %lu block_size: %lu runBlock_block: %lu", + _n, _b, _useRunBlock.Rank(1, _blockCnt - 1)) ; + _runBlockSeq.PrintStats() ; + _waveletSeq.PrintStats() ; + } +} ; +} + +#endif diff --git a/compactds/Sequence_RunLength.hpp b/compactds/Sequence_RunLength.hpp new file mode 100644 index 0000000..028ac80 --- /dev/null +++ b/compactds/Sequence_RunLength.hpp @@ -0,0 +1,191 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE_RUNLENGTH +#define _MOURISL_COMPACTDS_SEQUENCE_RUNLENGTH + +#include "Sequence.hpp" +#include "Sequence_WaveletTree.hpp" +#include "PartialSum.hpp" +#include "SimpleVector.hpp" + +// This sequence type assumes the alphabet coder is plain. +namespace compactds { +class Sequence_RunLength : public Sequence +{ +private: + Bitvector_Sparse _runs ; // mark the beginning of each runs, E in the manuscript + Sequence_WaveletTree _runChars ; // the character for each run, supporting ranking, L' in the manuscript + PartialSum *_alphabetPartialSum ; // the partial length with respect to each alphabet, D in the manuscript + size_t _rcnt ; +public: + Sequence_RunLength() + { + _n = _rcnt = 0 ; + } + + ~Sequence_RunLength() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + _n = _rcnt = 0 ; + delete[] _alphabetPartialSum ; + } + } + + size_t GetSpace() + { + return _space + _alphabets.GetSize() - sizeof(_alphabets) + sizeof(*this) ; + } + + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + size_t i ; + uint8_t c ; // character + + if (_alphabets.GetSize() == 0) + _alphabets.InitFromList(alphabetMap, strlen(alphabetMap)) ; + + // Get the runs + _n = sequenceLength ; + _rcnt = 1 ; + c = S.Read(0) ; + for (i = 1 ; i < sequenceLength ; ++i) + { + if ( S.Read(i) 
!= c) + { + ++_rcnt ; + c = S.Read(i) ; + } + } + + FixedSizeElemArray chars ; + WORD *W = Utils::MallocByBits(sequenceLength + 2) ; + chars.Malloc(S.GetElemLength(), _rcnt) ; + + c = S.Read(0) ; + chars.Write(0, c) ; + Utils::BitSet(W, 0) ; + _rcnt = 1 ; + for (i = 1 ; i < sequenceLength ; ++i) + { + if (S.Read(i) != c) + { + c = S.Read(i) ; + + Utils::BitSet(W, i) ; + chars.Write(_rcnt, c) ; + ++_rcnt ; + } + } + //Utils::BitSet(W, sequenceLength) ; + _runs.Init(W, sequenceLength + 2) ; + _runChars.SetSelectSpeed( DS_SELECT_SPEED_NO ) ; + _runChars.Init(chars, _rcnt, alphabetMap) ; + _space = _runs.GetSpace() - sizeof(_runs) + _runChars.GetSpace() - sizeof(_runChars) ; + + + // Process the runs/partial sums for each alphabet + int alphabetSize = _alphabets.GetSize() ; + _alphabetPartialSum = new PartialSum[alphabetSize] ; + for (c = 0 ; c < alphabetSize ; ++c) + { + memset(W, 0, Utils::BitsToWords(sequenceLength) * sizeof(WORD)) ; + size_t psum = 0 ; + Utils::BitSet(W, 0) ; + for (i = 0 ; i < sequenceLength ; ) + { + if (S.Read(i) != c) + { + ++i ; + continue ; + } + size_t j ; + for (j = i ; j < sequenceLength ; ++j) + if (S.Read(j) != c) + break ; + psum += j - i ; + Utils::BitSet(W, psum) ; + i = j ; + } + _alphabetPartialSum[c].InitFromBitvector(W, psum + 1) ; + _space += _alphabetPartialSum[c].GetSpace() - sizeof(_alphabetPartialSum[c]) ; + } + free(W) ; + } + + ALPHABET Access(size_t i) const + { + return _runChars.Access(_runs.Rank(1, i) - 1) ; + } + + size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const + { + if (!inclusive) + { + if (i == 0) + return 0 ; + else + --i ; + } + + size_t cid = _alphabets.Encode(c) ; + size_t rrank = _runs.Rank(1, i) ; // rank in runs + + bool inRun = true ; + size_t crank = _runChars.RankAndTest(c, rrank - 1, inRun) ; // rank for this character + //printf("%c %d: rrank=%d crank=%d\n", c, i, rrank, crank) ; + if (inRun) + { + size_t psum = _alphabetPartialSum[cid].Sum(crank - 1) ; + //printf("%d %d. 
ret=%d\n", psum, _runs.Select(1, rrank), + // psum + i - _runs.Select(1, rrank) + 1) ; + return psum + i - _runs.Select(1, rrank) + 1 ; + } + else + { + //printf("other %d\n", _alphabetPartialSum[cid].Sum(crank)) ; + return _alphabetPartialSum[cid].Sum(crank) ; + } + } + + // Not supported + size_t Select(ALPHABET c, size_t i) const + { + return 0 ; + } + + void Save(FILE *fp) + { + Sequence::Save(fp) ; + SAVE_VAR(fp, _rcnt) ; + _runs.Save(fp) ; + _runChars.Save(fp) ; + int alphabetSize = _alphabets.GetSize() ; + for (int i = 0 ; i < alphabetSize ; ++i) + _alphabetPartialSum[i].Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + Sequence::Load(fp) ; + LOAD_VAR(fp, _rcnt) ; + _runs.Load(fp) ; + _runChars.Load(fp) ; + int alphabetSize = _alphabets.GetSize() ; + _alphabetPartialSum = new PartialSum[alphabetSize] ; + for (int i = 0 ; i < alphabetSize ; ++i) + _alphabetPartialSum[i].Load(fp) ; + } + + void PrintStats() + { + Utils::PrintLog("Sequence_RunLength: total_length: %lu run count: %lu", _n, _rcnt) ; + } +} ; +} + +#endif diff --git a/compactds/Sequence_WaveletTree.hpp b/compactds/Sequence_WaveletTree.hpp new file mode 100644 index 0000000..440de14 --- /dev/null +++ b/compactds/Sequence_WaveletTree.hpp @@ -0,0 +1,338 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE_WAVELETTREE +#define _MOURISL_COMPACTDS_SEQUENCE_WAVELETTREE + +#include "Utils.hpp" +#include "Sequence.hpp" + +#include + +#include "Bitvector_Plain.hpp" +#include "Bitvector_RunLength.hpp" + +namespace compactds { +template +struct _sequence_wavelettree_node +{ + BvClass v ; + WORD prefix ; // the + int prefixLen ; // bits in prefix + int children[2] ; + + void Save(FILE *fp) + { + SAVE_VAR(fp, prefix) ; + SAVE_VAR(fp, prefixLen) ; + SAVE_ARR(fp, children, 2) ; + v.Save(fp) ; + } + + void Load(FILE *fp) + { + LOAD_VAR(fp, prefix) ; + LOAD_VAR(fp, prefixLen) ; + LOAD_ARR(fp, children, 2) ; + v.Load(fp) ; + } +} ; + +// The implementation of wavelet tree in either +// perfect balanced or huffman shape, 
+// depending on the choice of alphabet. +template +class Sequence_WaveletTree: public Sequence +{ +private: + struct _sequence_wavelettree_node *_T ; + int _tNodeCnt ; + int _selectSpeed ; + + // Based on the pos-th bits (0-index, count from leftside) + // maxPosToRight: record the maximum distance from pos to right side. + // return: the number of 1s + uint64_t ConvertSequenceToBits(const FixedSizeElemArray &S, const ALPHABET *alphabetMap, int pos, WORD *v, int &maxPosToRight) + { + size_t i ; + size_t n = S.GetSize() ; + uint64_t ret = 0; + maxPosToRight = 0 ; + + for (i = 0 ; i < n ; ++i) + { + int codeLen = 0 ; + int b = _alphabets.Encode( alphabetMap[S.Read(i)], codeLen ) ; + if (codeLen - pos > maxPosToRight) + maxPosToRight = codeLen - pos ; + if (b & (1<<(codeLen - pos - 1))) + { + Utils::BitSet(v, i) ; // Assume the array is already initated to be all 0 + ++ret ; + } + } + return ret ; + } + + // Assume left and right's memory has been allocated. + void SplitSequence(const FixedSizeElemArray &orig, WORD *v, + FixedSizeElemArray &left, FixedSizeElemArray &right) + { + size_t i ; + size_t len = orig.GetSize() ; + + size_t leftLen = 0 ; + size_t rightLen = 0 ; + for (i = 0 ; i < len ; ++i) + { + if (!Utils::BitRead(v, i)) + { + left.Write(leftLen, orig.Read(i)) ; + ++leftLen ; + } + else + { + right.Write(rightLen, orig.Read(i)) ; + ++rightLen ; + } + } + } + + // The recursive function that construct the tree. + // Assumes that the leaf node always has the brother. 
+ // depth: how many bits has been processed so far + // tused: the number of wavelet tree node used so far + // bufferv: preallcoated memory to holding temporary bit array + // return: node id (index in T) + int BuildTree(const FixedSizeElemArray &S, const ALPHABET *alphabetMap, int depth, WORD prefix, WORD *bufferv) + { + size_t len = S.GetSize() ; + int ti = _tNodeCnt ; + ++_tNodeCnt ; + int remainingBits ; + + memset(bufferv, 0, Utils::BitsToWordBytes(len)) ; + uint64_t onecnt = ConvertSequenceToBits(S, alphabetMap, depth, bufferv, remainingBits) ; + _T[ti].v.SetSelectSpeed(_selectSpeed) ; + _T[ti].v.Init(bufferv, len) ; + _space += _T[ti].v.GetSpace() - sizeof(_T[ti].v) ; + _T[ti].prefix = prefix ; + _T[ti].prefixLen = depth ; + if (remainingBits == 1 || S.GetSize() == 0) + { + // Reach leaf. + _T[ti].children[0] = _T[ti].children[1] = -1 ; + return ti ; + } + FixedSizeElemArray leftS, rightS ; // the memory should be automatically released + leftS.Malloc(S.GetElemLength(), len - onecnt) ; + rightS.Malloc(S.GetElemLength(), onecnt) ; + SplitSequence(S, bufferv, leftS, rightS) ; + + _T[ti].children[0] = BuildTree(leftS, alphabetMap, depth + 1, prefix << 1, bufferv) ; + _T[ti].children[1] = BuildTree(rightS, alphabetMap, depth + 1, (prefix << 1) | 1ull, bufferv) ; + + return ti ; + } + + int AccessInNode(int ti, size_t i) const + { + return _T[ti].v.Access(i) ; + } + + // Calculate the rank(type, i) in T[ti] + size_t RankInNode(int ti, int type, size_t i, int inclusive = 1) const + { + return _T[ti].v.Rank(type, i, inclusive) ; + } + + size_t SelectInNode(int ti, int type, size_t i) const + { + return _T[ti].v.Select(type, i); + } + + // Recursive function for Select + // c: the code for the alphabet. + // l: the length of the code + // i: the select we want to query. The ith chracter c. 
+ // ti: tree node idx + // depth: the recursive depth + size_t RecursiveSelect(WORD c, int l, size_t i, int ti, int depth ) const + { + int b = (c >> (l-depth-1)) & 1 ; + if (depth >= l - 1) + { + return SelectInNode(ti, b, i) ; + } + + // Need the +1 to convert the index from Select to the rank as the input to Select + return SelectInNode(ti, b, RecursiveSelect(c, l, i, _T[ti].children[b], depth + 1) + 1 ) ; + } +public: + Sequence_WaveletTree() + { + _tNodeCnt = 0 ; + _selectSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + } + + ~Sequence_WaveletTree() {Free() ;} + + void Free() + { + if (_tNodeCnt) + { + delete[] _T ; + _T = NULL ; + _tNodeCnt = 0 ; + } + } + + void SetSelectSpeed(int speed) + { + _selectSpeed = speed ; + } + + size_t GetSpace() {return _space + _alphabets.GetSpace() - sizeof(_alphabets) + sizeof(this) ;} + + // We compactly represent the input sequence as fixed-size element array in a plain fashion + // just to save some memory when construct the tree. + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + _space = 0 ; + this->_n = sequenceLength ; + + if (_alphabets.GetSize() == 0) + _alphabets.InitFromList(alphabetMap, strlen(alphabetMap)) ; + + _T = new struct _sequence_wavelettree_node[_alphabets.GetAlphabetCapacity() - 1] ; + _tNodeCnt = 0 ; + _space += sizeof(*_T) * (_alphabets.GetAlphabetCapacity() - 1) ; + + WORD *bufferv = Utils::MallocByBits(sequenceLength) ; + BuildTree(S, alphabetMap, 0, 0, bufferv) ; + free(bufferv) ; + } + + // Return: the alphabet at position i. + ALPHABET Access(size_t i) const + { + int l = 0 ; + WORD code = 0 ; + int ti = 0 ; + for (l = 0 ; ti != -1 ; ++l) + { + int b = AccessInNode(ti, i) ; + code = (code << 1) | b ; + // Need -1 to convert the rank number to array index. + // There is no need to check the negativity from -1, + // because we know the current bit is 0, so rank>=1. 
+ //i = _T[ti].v.Rank(b, i) - 1 ; + i = RankInNode(ti, b, i) - 1 ; + ti = _T[ti].children[b] ; + } + return _alphabets.Decode(code, l) ; + } + + // Return: the number of alphabet c's in [0..i] + size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const + { + int l = 0 ; // l: the length of the code + WORD code = _alphabets.Encode(c, l) ; + int depth = 0 ; + int ti = 0 ; + if (!inclusive) // Since in the wavelet tree, the non-inclusive operation should only + // happen in the leaf node, so we directly modify i here for simplicity. + { + if (i == 0) + return 0 ; + else + --i ; + } + for (depth = 0 ; depth < l ; ++depth) + { + int b = (code >> (l - depth - 1)) & 1 ; + + //i = _T[ti].v.Rank(b, i) ; + i = RankInNode(ti, b, i) ; + + if (i == 0 || depth == l - 1) + break ; + // R count the number of 1's (or 0's), so we need to -1 to change it to 0-based index + // as in the bitvector. + --i ; + ti = _T[ti].children[b] ; + } + return i ; + } + + // Return: rank of c in [0..i] (inclusive), + // also test whether T[i]==c, return through isC + size_t RankAndTest(ALPHABET c, size_t i, bool &isC) const + { + int l = 0 ; // l: the length of the code + WORD code = _alphabets.Encode(c, l) ; + int depth = 0 ; + int ti = 0 ; + + isC = true ; + for (depth = 0 ; depth < l ; ++depth) + { + int b = (code >> (l - depth - 1)) & 1 ; + if (isC && b != AccessInNode(ti, i)) + isC = false ; + + //i = _T[ti].v.Rank(b, i) ; + i = RankInNode(ti, b, i) ; + + if (i == 0 || depth == l - 1) + break ; + // R count the number of 1's (or 0's), so we need to -1 to change it to 0-based index + // as in the bitvector. 
+ --i ; + ti = _T[ti].children[b] ; + } + return i ; + } + + + + // return: the index of the ith (1-based) c + size_t Select(ALPHABET c, size_t i) const + { + int l = 0 ; + WORD code = _alphabets.Encode(c, l) ; + return RecursiveSelect(code, l, i, 0, 0) ; + } + + void Save(FILE *fp) + { + Sequence::Save(fp) ; + SAVE_VAR(fp, _tNodeCnt) ; + SAVE_VAR(fp, _selectSpeed) ; + int i ; + for (i = 0 ; i < _tNodeCnt ; ++i) + _T[i].Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + Sequence::Load(fp) ; + LOAD_VAR(fp, _tNodeCnt) ; + LOAD_VAR(fp, _selectSpeed) ; + + if (_alphabets.GetSize() == 0) //empty tree + return ; + + _T = new struct _sequence_wavelettree_node[_tNodeCnt] ; + int i ; + for (i = 0 ; i < _tNodeCnt ; ++i) + _T[i].Load(fp) ; + } + + void PrintStats() + { + Utils::PrintLog("Sequence_WaveletTree: total_length: %lu node_cnt: %lu", _n, _tNodeCnt) ; + } + +} ; +} + +#endif diff --git a/compactds/SimpleVector.hpp b/compactds/SimpleVector.hpp new file mode 100644 index 0000000..8f51070 --- /dev/null +++ b/compactds/SimpleVector.hpp @@ -0,0 +1,388 @@ +#ifndef _LSONG_SIMPLE_VECTOR_HEADER +#define _LSONG_SIMPLE_VECTOR_HEADER + +// A light version of vector, which increase the size of the array by +// a value no more than specified if it got overflow. +// And the type of elements is basic. + +#include +#include +#include + +//const int maxInc = -1 ; + +template +class SimpleVector +{ +private: + int size ; + int capacity ; + int maxInc ; // The maximal value we can use to increase the capacity. + int inc ; + T *s ; +public: + SimpleVector() : maxInc( -1 ) + { + s = NULL ; + size = capacity = 0 ; + inc = 1 ; + } + + SimpleVector( int mi ): maxInc( mi ) + { + s = NULL ; + size = capacity = 0 ; + inc = 1 ; + } + + SimpleVector( const SimpleVector &in ) + { + size = in.size ; + capacity = in.capacity ; + if ( capacity > 0 ) + { + //s = in.s ; + //if ( in.s == NULL ) + // printf( "null s. 
%d %d\n", in.size, in.capacity ) ; + s = (T *)malloc( sizeof( T ) * capacity ) ; + memcpy( s, in.s, sizeof( T ) * capacity ) ; + } + else + s = NULL ; + inc = in.inc ; + maxInc = in.maxInc ; + } + + SimpleVector& operator=( const SimpleVector &in ) + { + if ( this != &in ) + { + if ( s != NULL ) + free( s ) ; + size = in.size ; + capacity = in.capacity ; + + if ( capacity > 0 ) + { + //s = in.s ; + s = (T *)malloc( sizeof( T ) * capacity ) ; + memcpy( s, in.s, sizeof( T ) * capacity ) ; + } + else + s = NULL ; + + inc = in.inc ; + maxInc = in.maxInc ; + } + return *this ; + } + + ~SimpleVector() + { + if ( s != NULL ) + free( s ) ; + capacity = 0 ; + size = 0 ; + } + + void Release() + { + if ( s != NULL ) + free( s ) ; + s = NULL ; + size = capacity = 0 ; + } + + void Reserve( int sz ) + { + if ( s != NULL ) + free( s ) ; + s = (T *)malloc( sizeof( T ) * sz ) ; + size = 0 ; + capacity = sz ; + inc = sz ; + + if ( maxInc > 0 && inc > maxInc ) + inc = maxInc ; + } + + + int PushBack( const T &in ) + { + if ( size == capacity ) + { + //int tmp = capacity ; + capacity += inc ; + inc *= 2 ; + if ( maxInc > 0 && inc > maxInc ) + inc = maxInc ; + if ( size == 0 ) + s = (T *)malloc( sizeof( T ) * capacity ) ; + else + s = (T *)realloc( s, sizeof( T ) * capacity ) ; + if ( s == NULL ) + { + fprintf( stderr, "%s: Failed to allocate memory.\n", __func__ ) ; + exit( 1 ) ; + } + } + s[ size ] = in ; + ++size ; + return size ; + } + + int PushBack( const SimpleVector &in ) + { + int newsize = size + in.size ; + if ( newsize > capacity ) + { + //int tmp = capacity ; + capacity = newsize + inc ; + inc *= 2 ; + if ( maxInc > 0 && inc > maxInc ) + inc = maxInc ; + if ( size == 0 ) + s = (T *)malloc( sizeof( T ) * capacity ) ; + else + s = (T *)realloc( s, sizeof( T ) * capacity ) ; + if ( s == NULL ) + { + fprintf( stderr, "%s: Failed to allocate memory.\n", __func__ ) ; + exit( 1 ) ; + } + } + memcpy( s + size, in.s, sizeof( T ) * in.size ) ; + size = newsize ; + return size ; + } 
+ + T PopBack() + { + if ( size == 0 ) + { + fprintf( stderr, "%s: empty array.\n", __func__ ) ; + exit( 1 ) ; + } + --size ; + return s[size] ; + } + + int GetInc() + { + return inc ; + } + + void SetInc( int in ) + { + inc = in ; + } + + void SetMaxInc( int in ) + { + maxInc = in ; + } + int GetMaxInc() + { + return maxInc ; + } + int Size() const + { + return size ; + } + + int Resize( int s ) + { + size = s ; + return size ; + } + + int Capacity() + { + return capacity ; + } + + T &Get( int i ) + { + if ( i >= size ) + { + fprintf( stderr, "%s: Access out of the vector.\n", __func__ ) ; + exit( 1 ) ; + } + return s[i] ; + } + + T &operator[]( int i ) const + { + /*if ( i >= size ) + { + printf( "ERROR\n" ) ; + }*/ + //assert( i < size ) ; + /*if ( i >= size ) + { + fprintf( stderr, "%s: Access out of the vector.\n", __func__ ) ; + exit( 1 ) ; + }*/ + return s[i] ; + } + + // Return how many element left. + int Remove( int ind ) + { + int i ; + if ( ind >= size ) + { + fprintf( stderr, "%s: Access out of the vector.\n", __func__ ) ; + exit( 1 ) ; + } + + //if ( size == 1 ) + // return 0 ; + for ( i = ind ; i < size - 1 ; ++i ) + s[i] = s[i + 1] ; + --size ; + return size ; + } + + // Allocate less memory. 
+ int Shrink() + { + if ( size < capacity / 4 ) + { + capacity /= 2 ; + inc = capacity ; + if ( inc > maxInc ) + inc = maxInc ; + s = (T *)realloc( s, sizeof( T ) * capacity ) ; + } + return capacity ; + } + + void Clear() + { + size = 0 ; + } + + void QSort( int (*compare)(const void*,const void*) ) + { + qsort( s, size, sizeof( T ), compare ) ; + } + + int BinarySearch( const T &v ) + { + int l, r, m ; + l = 0 ; + r = size - 1 ; + + while ( l <= r ) + { + m = ( l + r ) / 2 ; + if ( s[m] == v ) + return m ; + else if ( s[m] < v ) + l = m + 1 ; + else + r = m - 1 ; + + } + return l - 1 ; // Should be between l - 1 and l + } + + void Destroy() + { + if ( s != NULL ) + free( s ) ; + s = NULL ; + size = capacity = 0 ; + inc = 1 ; + } + + void Overwrite( const SimpleVector &in ) + { + if ( s != NULL ) + free( s ) ; + s = NULL ; + if ( in.s != NULL ) + s = (T *)malloc( sizeof( T ) * in.capacity ) ; + size = in.size ; + capacity = in.capacity ; + inc = in.inc ; + int i ; + for ( i = 0 ; i < size ; ++i ) + s[i] = in.s[i] ; + } + + void Reverse() + { + int i, j ; + T tmp ; + for ( i = 0, j = size - 1 ; i < j ; ++i, --j ) + { + tmp = s[j] ; + s[j] = s[i] ; + s[i] = tmp ; + } + } + + // Expand the array by given size. + // Does not care about the value in the new allocated space. 
+ int ExpandBy( int expandSize ) + { + int newSize = size + expandSize ; + if ( newSize <= capacity ) + { + size = newSize ; + } + else + { + //int tmp = capacity ; + capacity = newSize + inc ; + inc *= 2 ; + if ( maxInc > 0 && inc > maxInc ) + inc = maxInc ; + if ( size == 0 ) + s = (T *)malloc( sizeof( T ) * capacity ) ; + else + s = (T *)realloc( s, sizeof( T ) * capacity ) ; + if ( s == NULL ) + { + fprintf( stderr, "%s: Failed to allocate memory.\n", __func__ ) ; + exit( 1 ) ; + } + size = newSize ; + } + return size ; + } + + int ExpandTo( int newSize ) + { + return ExpandBy( newSize - size ) ; + } + + void ShiftRight( int shift ) + { + size = ExpandBy( shift ) ; + int i ; + + for ( i = size - 1 ; i >= shift ; --i ) + s[i] = s[i - shift] ; + return ; + } + + // Set the content to zero in the range + void SetZero( int start, int len ) + { + memset( s + start, 0, sizeof( T ) * len ) ; + } + + T *BeginAddress() + { + return s ; + } + T *EndAddress() + { + return s + size ; + } +} ; + +#endif diff --git a/compactds/SuffixArrayGenerator.hpp b/compactds/SuffixArrayGenerator.hpp new file mode 100644 index 0000000..8d52a13 --- /dev/null +++ b/compactds/SuffixArrayGenerator.hpp @@ -0,0 +1,725 @@ +#ifndef _MOURISL_COMPACTDS_SUFFIXARRAY_GENERATOR +#define _MOURISL_COMPACTDS_SUFFIXARRAY_GENERATOR + +#include + +#include "FixedSizeElemArray.hpp" +#include "DifferenceCover.hpp" + +// The class handle the generation of suffix array by chunks +// The chunk creation is based the sampled difference cover (Algorithm 11.9 is commented out) +namespace compactds { +class SuffixArrayGenerator +{ +private: + size_t _n ; + size_t _space ; + size_t _alphabetSize ; + + // The variables relate to generate the boundaries/_cuts + size_t _b ; + size_t* _cuts ; // The index on T + size_t _cutCnt ; + size_t **_cutLCP ; // self LCP for each cut + + // The variables relate to the difference cover and its ISAs + DifferenceCover _dc ; + size_t *_dcISA ; // The difference cover's index should be 
compacted when query this ISA + size_t _dcSize ; + + // Relate to cut ============================================ +#if 0 // The commented out codes is for Algorithm 11.9, which might be too slow for very repetitive sequence (i.e: ACGTACGTACGT....), so we have another implementation now + + // The functions relate to generate cut + int SuffixCompareCutString(const FixedSizeElemArray &T, size_t n, size_t i, const FixedSizeElemArray &s, size_t k) + { + if (k == 0) + return 0 ; + else + return T.SubrangeCompare(i, i + k - 1, s, 0, k - 1) ; + } + + // Count the size for each alphabet following current cut string. + // s: the cut string + // k: length of the cut string + void CountCutExtension(const FixedSizeElemArray &T, size_t n, const FixedSizeElemArray &s, size_t k, size_t *alphabetCounts) + { + size_t i ; + memset(alphabetCounts, 0, sizeof(alphabetCounts[0]) * _alphabetSize) ; + for (i = 0 ; i < n - k ; i += downsample) + { + if (!SuffixCompareCutString(T, n, i, s, k)) + alphabetCounts[ T.Read(i + k) ] += downsample ; + } + } + + // s: the cut string + // k: length of the cut string + void ExpandInCut(const FixedSizeElemArray &T, size_t n, FixedSizeElemArray &s, size_t k, size_t *chunkLens) + { + size_t* alphabetCounts = (size_t *)malloc(sizeof(size_t) * _alphabetSize) ; + CountCutExtension(T, n, s, k, alphabetCounts) ; + size_t c ; + for (c = 0 ; c < _alphabetSize ; ++c) + { + s.Write(k, c) ; + if (alphabetCounts[c] <= b) + { + if (chunkLens[_cutCnt - 1] + alphabetCounts[c] > b) + { + _cuts[_cutCnt].InitFromOtherPrefix(s, k + 1) ; + ++_cutCnt ; + chunkLens[_cutCnt - 1] = 0 ; + } + chunkLens[_cutCnt - 1] += alphabetCounts[c] ; + } + else + ExpandInCut(T, n, s, k + 1, chunkLens) ; + } + free(alphabetCounts) ; + } + + size_t GenerateCuts(const FixedSizeElemArray &T, size_t n) + { + size_t m = DIV_CEIL(2 * n, b) ; + _cuts = new FixedSizeElemArray[m + 1] ; + size_t* chunkLens = (size_t *)malloc(sizeof(size_t) * (m + 1)) ; + FixedSizeElemArray s ; + 
s.Malloc(T.GetElemLength(), _n < _b ? _n : b) ; + + _cuts[0].Malloc(T.GetElemLength(), 0) ; + chunkLens[0] = 0 ; + _cutCnt = 1 ; + ExpandInCut(T, n, s, 0, chunkLens) ; + _cuts[_cutCnt].Malloc(T.GetElemLength(), 0) ; + printf("%d\n", _cuts[1].GetSize()) ; + free(chunkLens) ; + return _cutCnt ; + } +#endif + + // TODO: handle the case where we don't generate difference cover + size_t GenerateCuts(size_t *_dcSA) + { + size_t i ; + size_t blockCnt = DIV_CEIL(_n, _b) ; + size_t stride = DIV_CEIL(_dcSize, blockCnt) ; + blockCnt = DIV_CEIL(_dcSize, stride) ; + _cutCnt = blockCnt ; + _cuts = (size_t *)malloc(sizeof(size_t) * (_cutCnt + 1)) ; + _cuts[0] = _n ; + for (i = stride ; i < _dcSize ; i += stride) + { + //printf("_cuts %d = %d\n", i / stride, _dcSA[i]) ; + _cuts[i / stride] = _dcSA[i] ; + } + _cuts[blockCnt] = _n ; + return _cutCnt ; + } + + // For each cut s, compute LCP(s, s[i:]) for i <= maxSize + void ComputeCutLCP(const FixedSizeElemArray &T, size_t n, size_t maxSize) + { + size_t i, j, l ; + _cutLCP = (size_t **)malloc(sizeof(*_cutLCP) * _cutCnt) ; + for (i = 0 ; i < _cutCnt ; ++i) + { + size_t jopenend = n - _cuts[i] ; + if (jopenend > maxSize ) + jopenend = maxSize ; + _cutLCP[i] = (size_t *)malloc(sizeof(_cutLCP[i][0]) * jopenend) ; + if (jopenend == 0) + continue ; + _cutLCP[i][0] = jopenend ; + for (j = 1 ; j < jopenend ; ++j) + { + for (l = 0 ; l < jopenend - j ; ++l) + { + if (T.Read(_cuts[i] + l) != T.Read(_cuts[i] + j + l)) + break ; + } + _cutLCP[i][j] = l ; + } + } + } + + // Compare the T[i,...] with a cut ci, and adjust other auxiliary data relating + // rightmosti: the start position corresponding to the rightmost j + // rightj: the rightmost end position + // The auxiliary data is for reusing some information from the + // preiouv T[i-1,...] comparison + // return: sign represent T[i:n]-cut . 
+ int CompareCutUsingCutLCP(const FixedSizeElemArray &T, size_t n, size_t i, size_t ci, + size_t &rightmosti, size_t &rightmostj) + { + size_t j ; // position on T + size_t cpos ; // position on cut + size_t overlap = 0 ; + size_t cutLen = n - _cuts[ci] ; + if (i == _cuts[ci]) + return 0 ; // return 0 for equal + //if (i == 15 && ci == 6) + // printf("> %d %d %d\n", i, rightmosti, rightmostj) ; + + if (cutLen > (size_t)_dc.GetV()) + cutLen = _dc.GetV() ; + if (rightmostj > 0 && i <= rightmostj) // i<=rightmostj? + { + overlap = _cutLCP[ci][i - rightmosti] ; + if (rightmostj <= i + overlap - 1) // we may need to update the range + { + /*for (j = rightmostj + 1, cpos = rightmostj - i + 1 ; + j < n && cpos < cutLen; ++j, ++cpos) + { + if (T.Read(j) != T.Read(_cuts[ci] + cpos)) + break ; + } + --j ; --cpos ; + rightmostj = j ; + rightmosti = i ;*/ + size_t localMatchCnt = T.PrefixMatchLen(rightmostj + 1, n - 1, + T, _cuts[ci] + rightmostj - i + 1, _cuts[ci] + cutLen - 1) ; + rightmosti = i ; + j = (rightmostj + 1) + localMatchCnt - 1 ; + rightmostj = j ; + cpos = j - i ; + } + else + { + return (int)T.Read(i + overlap) - T.Read(_cuts[ci] + overlap) ; + } + } + else + { + /*for (j = i, cpos = 0 ; j < n && cpos < cutLen ; ++j, ++cpos) + if (T.Read(j) != T.Read(_cuts[ci] + cpos)) + break ; + //printf("%d %d %d. 
%d\n", i, ci, _cuts[ci], cpos) ; + if (cpos == 0) + return T.Read(j) - T.Read(_cuts[ci] + cpos) ; + + --j; --cpos ; + rightmostj = j ; + rightmosti = i ;*/ + + size_t localMatchCnt = T.PrefixMatchLen(i, n - 1, + T, _cuts[ci], _cuts[ci] + cutLen - 1) ; + if (localMatchCnt == 0) + return T.Read(i) - T.Read(_cuts[ci]) ; + j = i + localMatchCnt - 1 ; + rightmostj = j ; + cpos = localMatchCnt - 1 ; + rightmosti = i ; + } + + if (j == n - 1 && cpos == n - _cuts[ci] - 1) + return 0 ; + else if (j == n - 1) + return -1 ; + else if (cpos == n - _cuts[ci] - 1) + return 1 ; + else + { + int cmp = T.Read(j + 1) - T.Read(_cuts[ci] + cpos + 1) ; + if (cmp == 0) + return CompareSuffixWithDC(i, _cuts[ci], n) ; + else + return cmp ; + } + } + + // Relate to suffix sorting ============================================ + void Swap(size_t &a, size_t &b) + { + size_t tmp ; + tmp = a ; a = b ; b = tmp ; + } + + // Compare T[a:] and T[b:] directly with DC, which assumes their first + // v prefix are matched + // @return: sign(T[a:]-T[b:]) + int CompareSuffixWithDC(size_t a, size_t b, size_t n) + { + if (a == b) + return 0 ; + int delta = _dc.Delta(a, b) ; + int compare = 0 ; + if (a + delta >= n) + compare = 1 ; + else if (b + delta >= n) + compare = -1 ; + else + { + size_t aisa = _dcISA[ _dc.CompactIndex(a + delta) ] ; + size_t bisa = _dcISA[ _dc.CompactIndex(b + delta) ] ; + if (aisa < bisa) + compare = -1 ; + else + compare = 1 ; + } + return compare ; + } + + // Use difference cover to quick sort the SA. 
We don't need to pass T now + void QSortWithDC(size_t *sa, size_t m, size_t s, size_t e, size_t n) + { + if (s >= e) + return ; + // Partition + Swap(sa[s], sa[(s + e)/2]) ; + size_t pivot = sa[s] ; // pivot is the median element + size_t pi, pj ; // partiation indexes + pi = s + 1; // pi points to the current process element + pj = e + 1 ; // pj points the first element of the second chunk + while (pi < pj) + { + int comparePivot = CompareSuffixWithDC(sa[pi], pivot, n) ; + + if (comparePivot < 0) + ++pi ; + else + { + Swap(sa[pi], sa[pj - 1]) ; + --pj ; + } + } + Swap(sa[s], sa[pi - 1]) ; + if (pi > 2) + QSortWithDC(sa, m, s, pi - 2, n) ; + QSortWithDC(sa, m, pi, e, n) ; + } + + // Sort T[s..e], and only consider the positions in sa of size m + // s, e: the range for sa + // d: the preifx already matched in T[s..e], kind of as depth. + // dcStrategy: how to use the difference cover. 0-no _dc, 1-use _dc, 2-return when reach _dcv + void MultikeyQSort(const FixedSizeElemArray &T, size_t n, size_t *sa, size_t m, size_t s, size_t e, size_t d, int dcStrategy, size_t *alphabetCounts) + { + if (s >= e) + return ; + if (dcStrategy != 0 && d >= (size_t)_dc.GetV()) + { + if (dcStrategy == 2) + return ; + else if (dcStrategy == 1) + { + // We now use compare the suffix using difference cover + QSortWithDC(sa, m, s, e, n) ; + return ; + } + } + + size_t i ; + size_t tmp ; + // Find pivot + size_t pivot = 0 ; + + // quick check whether every suffix is the same using blocks + const int alphabetBits = T.GetElemLength() ; + const int block = WORDBITS / alphabetBits ; + while (1) + { + if (dcStrategy != 0 && d >= (size_t)_dc.GetV()) + break ; + bool passEnd = false ; // any suffix pass the end of the T + WORD foundw = 0 ; + if (sa[s] + d + block - 1 < n) + foundw = T.PackRead(sa[s] + d, block) ; + else + passEnd = true ; + for (i = s + 1 ; i <= e ; ++i) + { + if (sa[i] + d + block - 1 < n) + { + WORD w = T.PackRead(sa[i] + d, block) ; + if (w != foundw) + break ; + } + else + { + 
if (passEnd == false) + break ; + } + } + if (i > e) + d += block ; + else + break ; + } + + // Real search + while (1) + { + if (dcStrategy != 0 && d >= (size_t)_dc.GetV()) + break ; + + memset(alphabetCounts, 0, sizeof(alphabetCounts[0]) * (_alphabetSize + 1)) ; + for (i = s ; i <= e ; ++i) + { + if (sa[i] + d < n) + ++alphabetCounts[T.Read(sa[i] + d) + 1] ; + else + ++alphabetCounts[0] ; + } + if (alphabetCounts[0] == e - s + 1) + return ; + tmp = 0 ; + for (i = 0 ; i <= _alphabetSize ; ++i) + { + if (alphabetCounts[i] > 0) + ++tmp ; + } + if (tmp == 1) // the next character is the same for all the suffixes in the range + { + ++d ; + continue ; + } + + tmp = 0 ; + for (i = 0 ; i <= _alphabetSize ; ++i) + { + tmp += alphabetCounts[i] ; + if (tmp >= (e - s + 1) / 2) + break ; + } + pivot = i ; // pivot character + if (pivot > 0) + --pivot ; + break ; + } + + // Partition + size_t pi, pj, pk ; // partiation indexes + pi = s ; // pi points to the first element of the middle chunk + pj = s ; // pj is the current element + pk = e ; // pk points the last element of the middle chunk + while (pj <= pk) + { + int comparePivot = 0 ; + if (sa[pj] + d >= n) + comparePivot = -1 ; + else + { + size_t c = T.Read(sa[pj] + d) ; + if (c < pivot) + comparePivot = -1 ; + else if (c == pivot) + comparePivot = 0 ; + else + comparePivot = 1 ; + } + + if (comparePivot == -1) + { + Swap(sa[pi], sa[pj]) ; + ++pi ; ++pj ; + } + else if (comparePivot == 1) + { + Swap(sa[pj], sa[pk]) ; + if (pk == 0) + break ; + --pk ; + } + else + ++pj ; + } + + // Recursive sorting + if (pi >= 1) + MultikeyQSort(T, n, sa, m, s, pi - 1, d, dcStrategy, alphabetCounts) ; + MultikeyQSort(T, n, sa, m, pi, pk, d + 1, dcStrategy, alphabetCounts) ; + MultikeyQSort(T, n, sa, m, pj, e, d, dcStrategy, alphabetCounts) ; + } + + // Sort the suffixes in the difference cover + // @return: the suffix array of the difference cover + size_t *SortSuffixInDC(const FixedSizeElemArray &T, size_t n) + { + size_t i, k ; + size_t 
*sa ; + size_t *rank ; // rank of a suffix consider the prefix of size k, allowing ties + size_t *nextBuffer ; // buffer for next iteration (double expanded) information + size_t *count ; // cumulative rank count + size_t maxRank ; // distinct ranks + size_t dci ; // compacted difference cover index + int v = _dc.GetV() ; + + sa = (size_t *)malloc(sizeof(size_t) * _dcSize) ; + rank = (size_t *)malloc(sizeof(size_t) * _dcSize) ; + nextBuffer = (size_t *)malloc(sizeof(size_t) * _dcSize) ; + count = (size_t *)malloc(sizeof(size_t) * _dcSize) ; + + // Sort by their first v characters + _dc.GetDiffCoverList(n, sa) ; + size_t *alphabetCounts = (size_t *)malloc(sizeof(size_t) * (_alphabetSize + 1)) ; + MultikeyQSort(T, n, sa, _dcSize, 0, _dcSize - 1, 0, /*dcStrategy=*/2, alphabetCounts) ; + free(alphabetCounts) ; + + maxRank = 0 ; + count[0] = 0 ; + for (i = 0 ; i < _dcSize ; ++i) + { + if (i > 0 && T.SubrangeCompare( sa[i - 1], sa[i - 1] + v - 1, T, sa[i], sa[i] + v - 1)) + { + ++maxRank ; + count[maxRank] = 0 ; + } + dci = _dc.CompactIndex( sa[i] ) ; + rank[dci] = maxRank ; + ++count[maxRank] ; + } + for (i = 1 ; i <= maxRank ; ++i) + count[i] += count[i - 1] ; + + // Sorting difference cover using Manber-Myers algorithm + size_t *tmpSwap ; + for (k = v ; k < n /*&& maxRank < _dcSize - 1*/ ; k <<= 1) + { + // Get the new SA, nextBuffer serves as the next SA + size_t ri ; // reverse i + for (ri = 0 ; ri < _dcSize; ++ri) + { + i = _dcSize - 1 - ri ; + + dci = _dc.CompactIndex( sa[i] ) ; + if (sa[i] >= n - k) + nextBuffer[i] = sa[i] ; + if (sa[i] < k) + continue ; + size_t kbeforeDci = _dc.CompactIndex(sa[i] - k) ; + nextBuffer[ count[ rank[kbeforeDci] ] - 1 ] = sa[i] - k ; + --count[ rank[kbeforeDci] ] ; + } + //memcpy(sa, nextBuffer, sizeof(sa[0]) * _dcSize) ; + tmpSwap = sa ; + sa = nextBuffer ; + nextBuffer = tmpSwap ; + + // Update the rank, nextBuffer serves as the next rank. 
+ maxRank = 0 ; + count[0] = 0 ; + size_t prevDci = 0 ; + for (i = 0 ; i < _dcSize ; ++i) + { + dci = _dc.CompactIndex(sa[i]) ; + if (i > 0 && + (rank[dci] != rank[prevDci] || + sa[i - 1] + k >= n || sa[i] + k >= n || + rank[ _dc.CompactIndex(sa[i - 1] + k)] != rank[_dc.CompactIndex(sa[i] + k)])) + { + ++maxRank ; + count[maxRank] = 0 ; + } + nextBuffer[dci] = maxRank ; + ++count[maxRank] ; + prevDci = dci ; + } + tmpSwap = rank ; + rank = nextBuffer ; + nextBuffer = tmpSwap ; + + if (maxRank >= _dcSize - 1) + break ; + + for (i = 1 ; i <= maxRank ; ++i) + count[i] += count[i - 1] ; + } + + free(nextBuffer) ; + free(count) ; + + _dcISA = rank ; + return sa ; + } + +public: + SuffixArrayGenerator() + { + _b = 1<<24 ; // 2^24, 16MB block size by default + _n = _space = 0 ; + _cuts = NULL ; + _dcISA = NULL ; + } + + ~SuffixArrayGenerator() + { + Free() ; + } + + void Free() + { + _space = 0 ; + + if (_cuts != NULL) + { + free(_cuts) ; + if (_cutLCP != NULL) + { + size_t i ; + for (i = 0 ; i < _cutCnt ; ++i) + free(_cutLCP[i]) ; + free(_cutLCP) ; + } + } + if (_dcISA != NULL) + free(_dcISA) ; + } + + size_t GetSpace() + { + return _space + sizeof(*this) ; + } + + // Initialize the generator to obtain the _cuts + // _dcv: difference cover period + // @return: the number of _cuts + size_t Init(const FixedSizeElemArray &T, size_t n, size_t b, int dcv, int alphabetSize) + { + this->_n = n ; + if (b > 0) + this->_b = b ; + this->_alphabetSize = alphabetSize ; + _dc.Init(dcv) ; + _dcSize = _dc.GetSize(n) ; + size_t *dcSA = SortSuffixInDC(T, n) ; + + GenerateCuts(dcSA) ; + ComputeCutLCP(T, n, dcv) ; + free(dcSA) ; + return _cutCnt ; + } + + size_t GetChunkCount() + { + return _cutCnt ; + } + + // Generate the from-th chunk to to-th chunk for T[s..e], both are inclusive + // Each chunk is left close, right open for the cut. + // The procedure utilized _cutLCP to expediate the search. 
+ void GetChunksPositions(const FixedSizeElemArray &T, size_t n, size_t from, size_t to, size_t s, size_t e, std::vector< std::vector > &pos) + { + size_t i, j ; + if (to >= _cutCnt) + to = _cutCnt - 1 ; + if (e >= n) + e = n - 1 ; + std::vector< std::vector >().swap(pos) ; + for (j = from ; j <= to ; ++j) + pos.push_back( std::vector() ) ; + + size_t *rightmosti = (size_t *)calloc(to - from + 2, sizeof(size_t)) ; + size_t *rightmostj = (size_t *)calloc(to - from + 2, sizeof(size_t)) ; + for (i = s ; i <= e ; ++i) + { + if ((from == 0 || CompareCutUsingCutLCP(T, n, i, from, rightmosti[from - from], rightmostj[from - from]) >= 0) + && (to == _cutCnt - 1 + || CompareCutUsingCutLCP(T, n, i, to + 1, rightmosti[to + 1 - from], rightmostj[to + 1 - from]) < 0)) + { + for (j = from + 1 ; j <= to ; ++j) + { + if (CompareCutUsingCutLCP(T, n, i, j, rightmosti[j-from], rightmostj[j-from]) < 0) + break ; + } + pos[(j-1) - from].push_back(i) ; + } + } + free(rightmosti) ; + free(rightmostj) ; + } + + void SortSuffixByPos(const FixedSizeElemArray &T, size_t n, size_t *pos, size_t m, size_t *sa) + { + if (m == 0) + return ; + size_t i ; + if (sa != pos) + for (i = 0 ; i < m ; ++i) + sa[i] = pos[i] ; + size_t *alphabetCounts = (size_t *)malloc(sizeof(size_t) * (_alphabetSize + 1)) ; + MultikeyQSort(T, n, sa, m, 0, m - 1, 0, /*dcStrategy=*/1, alphabetCounts) ; + free(alphabetCounts) ; + } + + // Functions relating to use disk to hold chunks + // Output each chunk to prefix_{xxx}.chunk file + void OutputChunksToFiles(char *prefix) + { + int i ; + char filename[1024] ; + FILE **fps ; + fps = (FILE **)malloc(sizeof(FILE*) * _cutCnt) ; + for (i = 0 ; i < (int)_cutCnt ; ++i) + { + sprintf(filename, "%s_%d.chunk", prefix, i) ; + fps[i] = fopen(filename, "w") ; + } + + for (i = 0 ; i < (int)_cutCnt ; ++i) + fclose(fps[i]) ; + free(fps) ; + } + + // Read in the i-th chunk file + void ReadChunkFile(char *prefix, int i, std::vector &pos) + { + char filename[1024] ; + sprintf(filename, 
"%s_%d.chunk", prefix, i) ; + FILE *fp = fopen(filename, "r") ; + + fclose(fp) ; + } + + // Remove all the temporary chunk files + void CleanChunkFiles(char *prefix) + { + char filename[1024] ; + for (int i = 0 ; i < (int)_cutCnt ; ++i) + { + sprintf(filename, "%s_%d.chunk", prefix, i) ; + } + } + + // Use the Theorem 2 from "Fast Lightweight Suffix Array Construction and Checking" + // The simpler implementation requiring creating ISA, so it is memory intensive + bool ValidateSA(const FixedSizeElemArray &T, size_t n, size_t *sa) + { + size_t i ; + size_t *isa = (size_t *)malloc(sizeof(size_t) * n) ; + for (i = 0 ; i < n ; ++i) + { + if (sa[i] >= n) + return false ; + if (i > 0) + { + if (sa[i - 1] == sa[i]) + return false ; + if (T.Read(sa[i - 1]) > T.Read(sa[i])) + return false ; + } + } + for (i = 0 ; i < n ; ++i) + isa[sa[i]] = i ; + + for (i = 1 ; i < n ; ++i) + { + if (T.Read(sa[i - 1]) == T.Read(sa[i])) + { + if (sa[i-1] + 1 < n && sa[i] + 1 < n) + { + if (isa[ sa[i - 1] + 1] > isa[ sa[i] + 1]) + return false ; + } + else if (sa[i] + 1 == n) // only the previous can be followed by the end of the string + return false ; + } + } + + free(isa) ; + return true ; + } +} ; +} + +#endif diff --git a/compactds/Tree.hpp b/compactds/Tree.hpp new file mode 100644 index 0000000..34063ab --- /dev/null +++ b/compactds/Tree.hpp @@ -0,0 +1,167 @@ +#ifndef _MOURISL_COMPACTDS_TREE +#define _MOURISL_COMPACTDS_TREE + +#include "Utils.hpp" + +namespace compactds { + +class Tree +{ +protected: + size_t _space ; + size_t _n ; // number of nodes in tree + +public: + Tree() + { + _space = 0 ; + _n = 0 ; + } + ~Tree() {} + + virtual size_t GetSpace(bool inclusive) = 0 ; + + virtual size_t Root() const = 0 ; + virtual size_t ChildSelect(size_t v, size_t t) const = 0 ; // Get the t-th (1-based) child of v + virtual size_t FirstChild(size_t v) const = 0 ; + virtual size_t LastChild(size_t v) const = 0 ; + virtual size_t ChildrenCount(size_t v) const = 0 ; + virtual size_t 
ChildRank(size_t v) const = 0 ; // Rank is always 1-based + + virtual size_t NextSibling(size_t v) const = 0 ; + virtual size_t PrevSibling(size_t v) const = 0 ; + + virtual size_t Parent(size_t v) const = 0 ; + virtual bool IsLeaf(size_t v) const = 0 ; + + virtual size_t NodeMap(size_t v) const = 0 ; + virtual size_t NodeSelect(size_t i) const = 0 ; + + // Whether u is an ancestor of v. + virtual bool IsAncestor(size_t u, size_t v) const + { + size_t p = v ; + while (p != Root() && p != u) + p = Parent(p) ; + if (p == u) + return true ; + else + return false ; + } + + virtual size_t Depth(size_t v) const + { + if (v == Root()) + return 0 ; + size_t ret = 1 ; + size_t p = Parent(v) ; + while (p != Root()) + { + p = Parent(p) ; + ++ret ; + } + return ret ; + } + + virtual size_t LeafCountInSubTree(size_t v) const + { + if (IsLeaf(v)) + return 1 ; + size_t i ; + size_t childCnt = ChildrenCount(v) ; + size_t ret = 0 ; + for (i = 0 ; i < childCnt ; ++i) + ret += LeafCountInSubTree( ChildSelect(v, i + 1) ) ; + return ret ; + } + + virtual size_t SubTreeSize(size_t v) const + { + if (IsLeaf(v)) + return 1 ; + size_t i ; + size_t childCnt = ChildrenCount(v) ; + size_t ret = 0 ; + for (i = 0 ; i < childCnt ; ++i) + ret += SubTreeSize( ChildSelect(v, i + 1) ) ; + return ret + 1 ; + } + + virtual bool IsFirstChild(size_t v) const + { + if (v == Root()) + return true ; + if (ChildRank(v) == 1) + return true ; + return false ; + } + + virtual bool IsLastChild(size_t v) const + { + if (v == Root()) + return true ; + + size_t p = Parent(v) ; + size_t pChildCnt = ChildrenCount(p) ; + if (ChildRank(v) == pChildCnt) + return true ; + return false ; + } + + virtual size_t LCA(size_t u, size_t v) const + { + SimpleVector upath ; + SimpleVector vpath ; + + size_t p ; + + upath.PushBack(u) ; + p = Parent(u) ; + while (p != 0) + { + upath.PushBack(p) ; + p = Parent(p) ; + } + upath.PushBack(0) ; + + vpath.PushBack(v) ; + p = Parent(v) ; + while (p != 0) + { + vpath.PushBack(p) ; + p = 
Parent(p) ; + } + vpath.PushBack(0) ; + + upath.Reverse() ; + vpath.Reverse() ; + + size_t size = MIN(upath.Size(), vpath.Size()) ; + size_t i ; + for (i = 0 ; i < size; ++i) + if (upath[i] != vpath[i]) + break ; + return upath[i - 1] ; + } + + size_t GetSize() const + { + return _n ; + } + + virtual void Save(FILE *fp) + { + SAVE_VAR(fp, _space) ; + SAVE_VAR(fp, _n) ; + } + + virtual void Load(FILE *fp) + { + LOAD_VAR(fp, _space) ; + LOAD_VAR(fp, _n) ; + } +} ; + +} // end of namespace + +#endif diff --git a/compactds/Tree_BP.hpp b/compactds/Tree_BP.hpp new file mode 100644 index 0000000..67e9e59 --- /dev/null +++ b/compactds/Tree_BP.hpp @@ -0,0 +1,316 @@ +#ifndef _MOURISL_COMPACTDS_TREE_BP +#define _MOURISL_COMPACTDS_TREE_BP + +// Represent a tree by balanced parenthesis (Chapter 8.2) +// Each v points to a parenthesis like (...) containing the substree +// The implementation details might be different as we are using 0-index +// tree node id (i) as well. +// In the implementation, v is for the index on the encoded bitvector B. + +#include "Tree.hpp" +#include "Tree_Plain.hpp" +#include "Tree_Cardinal_Plain.hpp" +#include "Bitvector_Plain.hpp" +#include "DS_Parenthesis.hpp" + +namespace compactds { +class Tree_BP: public Tree +{ +private: + Bitvector_Plain _B ; // bits representation of the parenthesis + DS_Parenthesis _bp ; // dangling structure + + // DFS to mark the parenthesis as B + // tag: tree node id. 
bi: index on B + void Build(const struct _plainTreeNode *treeNodes, size_t n, size_t tag, + size_t *treeIdMap, size_t &visited, size_t &bi) + { + size_t c ; + _B.BitSet(bi) ; + + treeIdMap[tag] = visited ; + ++visited ; + + ++bi ; + for (c = treeNodes[tag].child ; c != 0 ; c = treeNodes[c].sibling) + Build(treeNodes, n, c, treeIdMap, visited, bi) ; + //_B.BitClear(bi) ; // close the parentehsis + ++bi ; + } + + void BuildFromCardinalTree(const struct _plainCardinalTreeNode *treeNodes, size_t n, size_t childCnt, size_t tag, size_t *treeIdMap, size_t &visited, size_t &bi) + { + size_t i ; + _B.BitSet(bi) ; + + treeIdMap[tag] = visited ; + ++visited ; + + ++bi ; + for (i = 0 ; i < childCnt ; ++i) + { + size_t c = treeNodes[tag].children[i] ; + if (c == 0) + continue ; + BuildFromCardinalTree(treeNodes, n, childCnt, c, treeIdMap, visited, bi) ; + } + //_B.BitClear(bi) ; // close the parentehsis + ++bi ; + } +public: + Tree_BP() {} + ~Tree_BP() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + _B.Free() ; + _bp.Free() ; + _n = 0 ; + } + } + + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + void Init(const struct _plainTreeNode *treeNodes, size_t n, size_t *treeIdMap) + { + _n = n ; + _B.Malloc(2 * n) ; + + size_t bi = 0 ; + size_t visited = 0 ; + Build(treeNodes, n, 0, treeIdMap, visited, bi) ; + + _B.Init() ; + + // the last 2,2 is for pattern 10 as "()" + // Note that due to our reading the bits from low to high, "1" will be the first bit + // In other word, order is reversed. 
+ _bp.Init(_B.GetData(), 2 * _n, 1, 2) ; + + _space = _B.GetSpace() - sizeof(_B) + _bp.GetSpace(false) ; + } + + void InitFromCardinalTree(const struct _plainCardinalTreeNode *treeNodes, size_t n, size_t childCount, size_t *treeIdMap) + { + _n = n ; + _B.Malloc(2 * n) ; + + size_t bi = 0 ; + size_t visited = 0 ; + BuildFromCardinalTree(treeNodes, n, childCount, 0, treeIdMap, visited, bi) ; + + _B.Init() ; + + // the last 2,2 is for pattern 10 as "()" + // Note that due to our reading the bits from low to high, "1" will be the first bit + // In other word, order is reversed. + _bp.Init(_B.GetData(), 2 * _n, 1, 2) ; + + _space = _B.GetSpace() - sizeof(_B) + _bp.GetSpace(false) ; + } + // The index in B + size_t Root() const + { + return 0 ; + } + + // @return: the t-th child (1-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + return _bp.Open(_bp.GetRmmTree().MinSelect(v + 1, _bp.Close(v, _B.GetData(), 2 * _n) - 1, + t, _B.GetData(), 2 * _n), _B.GetData(), 2*_n) ; + } + + size_t FirstChild(size_t v) const + { + return v + 1 ; + } + + // Parenthesis like + // (...(...)) + // | | + // v lastchild + size_t LastChild(size_t v) const + { + return _bp.Open(_bp.Close(v, _B.GetData(), 2*_n)-1, _B.GetData(), 2*_n) ; + } + + size_t ChildrenCount(size_t v) const + { + if (IsLeaf(v)) + return 0 ; + // Each child's (...) has excess 0 after the end. + return _bp.GetRmmTree().MinCount(v + 1, _bp.Close(v, _B.GetData(), 2*_n) - 1, + _B.GetData(), 2*_n) ; + } + + // return: v is the ret-th (1-based) child of the parent. + size_t ChildRank(size_t v) const + { + if (v == Root()) + return 0 ; + size_t p = Parent(v) ; + if (p + 1 == v) + return 1 ; + return _bp.GetRmmTree().MinCount(p + 1, v - 1, _B.GetData(), 2*_n) + 1 ; + } + + // The silbing function assumes v has + // those siblings. 
+ size_t NextSibling(size_t v) const + { + return _bp.Close(v, _B.GetData(), 2*_n) + 1 ; + } + + size_t PrevSibling(size_t v) const + { + return _bp.Open(v - 1, _B.GetData(), 2*_n) ; + } + + size_t Parent(size_t v) const + { + if (v == Root()) + return 0 ; + return _bp.Enclose(v, _B.GetData(), 2*_n) ; + } + + bool IsLeaf(size_t v) const + { + if (_B.Access(v + 1) == 0) + return true ; + return false ; + } + + size_t LCA(size_t u, size_t v) const + { + if (u > v) + { + size_t tmp = u ; + u = v ; + v = tmp ; + } + + //printf("%d %d: %d %d\n", u, v, _bp.GetRmmTree().Rmq(u, v, _B.GetData(), 2*_n) + 1, + // _bp.Enclose(_bp.GetRmmTree().Rmq(u, v, _B.GetData(), 2*_n) + 1, _B.GetData(), 2*_n)) ; + return _bp.Enclose(_bp.GetRmmTree().Rmq(u, v, _B.GetData(), 2*_n) + 1, _B.GetData(), 2*_n) ; + } + + // Maps index in B (v) back up to the actual node id + // Pre-order + size_t NodeMap(size_t v) const + { + return _B.Rank(1, v, /*inclusive=*/0) ; + } + + //Map actual node id to index in B (v). + // Pre-order Select + size_t NodeSelect(size_t i) const + { + return _B.Select(1, i + 1) ; + } + + size_t PostOrder(size_t v) const + { + return _B.Rank(0, _bp.Close(v, _B.GetData(), 2 * _n), /*inclusive*/0) ; + } + + size_t PostOrderSelect(size_t i) const + { + return _bp.Open(_B.Select(0, i + 1), _B.GetData(), 2 * _n) ; + } + + // Root has depth 0 + size_t Depth(size_t v) const + { + // Kind of excess + // inclusive=0 means rank-1 here + return 2 * _B.Rank(1, v, /*inclusive=*/0) - v ; + } + + // # of nodes in the substree, inclusive. + size_t SubTreeSize(size_t v) const + { + return (_bp.Close(v, _B.GetData(), 2 * _n) - v + 1) / 2 ; + } + + // Whether u is an ancestor of v. 
+ bool IsAncestor(size_t u, size_t v) const + { + size_t uclose = _bp.Close(u, _B.GetData(), 2 * _n) ; + if (u <= v && v <= uclose) + return true ; + return false ; + } + + // The ancestor at d levels above + size_t LevelAncestor(size_t v, int64_t d) const + { + return _bp.GetRmmTree().BwdSearch(v, -d, _B.GetData(), 2 * _n) ; + } + + // Would be v it self if v is the leaf. + size_t DeepestNode(size_t v) const + { + return _bp.GetRmmTree().RMq(v, _bp.Close(v, _B.GetData(), 2 * _n), _B.GetData(), 2 * _n) ; + } + + // The distance from v to the deepest leaf + // 0 if v is the leaf + size_t Height(size_t v) const + { + size_t depthv = Depth(v) ; + size_t depthc = Depth( DeepestNode(v) ) ; + return depthc - depthv ; + } + + // Number of leaves in the subtree of v + size_t LeafCountInSubTree(size_t v) const + { + if (IsLeaf(v)) + return 1 ; + else + { + // Since close(v) is ")", the LeafRank(close(v)) is automatically exclusive + return LeafRank( _bp.Close(v, _B.GetData(), 2 *_n)) - LeafRank(v) ; + } + } + + // Rank and select with respect to leaves in _B order + size_t LeafRank(size_t v, int inclusive = 1) const + { + return _bp.PatternRank(v, _B.GetData(), 2*_n, inclusive) ; + } + + size_t LeafSelect(size_t i) const + { + return _bp.PatternSelect(i, _B.GetData(), 2*_n) ; + } + + void Save(FILE *fp) + { + Tree::Save(fp) ; + _B.Save(fp) ; + _bp.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + Tree::Load(fp) ; + + _B.Load(fp) ; + _bp.Load(fp) ; + } +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_Cardinal.hpp b/compactds/Tree_Cardinal.hpp new file mode 100644 index 0000000..a260351 --- /dev/null +++ b/compactds/Tree_Cardinal.hpp @@ -0,0 +1,43 @@ +#ifndef _MOURISL_COMPACTDS_TREE_CARDINAL +#define _MOURISL_COMPACTDS_TREE_CARDINAL + +#include "Utils.hpp" + +namespace compactds { + +class Tree_Cardinal: public Tree +{ +protected: + size_t _c ; // cardinality (number of max children) +public: + Tree_Cardinal() + { + _space = 0 ; + _n = 0 ; + _c = 0 ; 
+ } + ~Tree_Cardinal() {} + + // Number of children with label l. 1: has such children. 0-don't + virtual size_t ChildrenLabeled(size_t v, size_t l) const = 0 ; + // The child with label l. + virtual size_t LabeledChild(size_t v, size_t l) const = 0 ; + // The label of the edge that leads to node v. + virtual size_t ChildLabel(size_t v) const = 0 ; + + virtual void Save(FILE *fp) + { + Tree::Save(fp) ; + SAVE_VAR(fp, _c) ; + } + + virtual void Load(FILE *fp) + { + Tree::Load(fp) ; + LOAD_VAR(fp, _c) ; + } +} ; + +} // end of namespace + +#endif diff --git a/compactds/Tree_Cardinal_LOUDS.hpp b/compactds/Tree_Cardinal_LOUDS.hpp new file mode 100644 index 0000000..67d0b75 --- /dev/null +++ b/compactds/Tree_Cardinal_LOUDS.hpp @@ -0,0 +1,203 @@ +#ifndef _MOURISL_COMPACTDS_TREE_CARDINAL_LOUDS +#define _MOURISL_COMPACTDS_TREE_CARDINAL_LOUDS + +// Level-order Unary degree sequence for cardinal tree (Chapter 8.1.1, Algorithm 8.4) +// The implementation details might be different as we are using 0-index +// tree node id (i) as well. +// In the implementation, v is for the index on the encoded bitvector B +// to be consistent with the genral LOUDS represent. This is DIFFERENT +// from the textbook. + +#include "Tree_Cardinal.hpp" +#include "Tree_Cardinal_Plain.hpp" +#include "Bitvector_Plain.hpp" + +namespace compactds { +template +class Tree_Cardinal_LOUDS: public Tree_Cardinal +{ +private: + BvClass _B ; +public: + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + // Use c bits per node. 
+ void Init(const struct _plainCardinalTreeNode *treeNodes, size_t n, size_t c, size_t *treeIdMap) + { + size_t i ; + WORD *W = Utils::MallocByBits(c * n) ; + + _n = n ; + _c = c ; + + // BFS on tree nodes + // The algorithm 8.3 will change the original plain tree + size_t *queue = (size_t *)malloc(sizeof(size_t) * n) ; + size_t qhead, qtail ; + queue[0] = 0 ; + qhead = 0 ; + qtail = 1 ; + while (qhead < qtail) + { + size_t node = queue[qhead] ; + if (treeIdMap != NULL) + treeIdMap[node] = qhead ; + + ++qhead ; + + for (i = 0 ; i < _c ; ++i) + { + if (treeNodes[node].children[i] != 0) + { + Utils::BitSet(W, _c * (qhead - 1) + i) ; + queue[qtail] = treeNodes[node].children[i] ; + ++qtail ; + } + } + } + free(queue) ; + + _B.Init(W, c * n) ; + _space = _B.GetSpace() - sizeof(_B); + free(W) ; + } + + // The index in B + size_t Root() const + { + return 0 ; + } + + // @return: the t-th child (0-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + return (_B.Rank(1, v, /*inclusive=*/0) + t) * _c ; + } + + size_t FirstChild(size_t v) const + { + return (_B.Rank(1, v, /*inclusive=*/0) + 1) * _c ; + } + + size_t LastChild(size_t v) const + { + return (_B.Rank(1, v + _c - 1)) * _c ; + } + + size_t ChildrenCount(size_t v) const + { + return _B.Rank(1, v + _c - 1) - _B.Rank(1, v, 0) ; + } + + // return: v is the ret-th (1-based) child of the parent. + size_t ChildRank(size_t v) const + { + if ( v == Root()) + return 0 ; + size_t tid = NodeMap(v) ; + size_t j = _B.Select(1, tid) ; // edge from parent to v + return _B.Rank(1, j) - _B.Rank(1, j/_c * _c, /*inclusive=*/0) ; + } + + // The silbing function assumes v has + // those siblings. 
+ size_t NextSibling(size_t v) const + { + return v + 1 ; + } + + size_t PrevSibling(size_t v) const + { + return v - 1 ; + } + + size_t Parent(size_t v) const + { + if (v == 0) + return Root() ; + else + { + size_t tid = NodeMap(v) ; + // _B.Select(1, tid) identify the edge from the parent to v + // Notice that even though tree node id is 0-based, the edge starts + // to correspnds to node id 1 + return (_B.Select(1, tid) / _c) * _c ; + } + } + + bool IsLeaf(size_t v) const + { + return ChildrenCount(v) == 0 ; + } + + size_t LCA(size_t u, size_t v) const + { + while (u != v) + { + if (u > v) + u = Parent(u) ; + else + v = Parent(v) ; + } + return u ; + } + + // Maps index in B (v) back up to the actual node id + size_t NodeMap(size_t v) const + { + return v / _c ; + } + + //Map actual node id to index in B (v). + size_t NodeSelect(size_t i) const + { + if (i == 0) + return Root() ; + else + return i * _c ; + } + + // Number of children with label l. 1: has such children. 0-don't + size_t ChildrenLabeled(size_t v, size_t l) const + { + return (_B.Access(v + l) == 1) ? 1 : 0 ; + } + + // The childr with label l. + // Assuming label l's child exist + // Notice the difference from Child() + size_t LabeledChild(size_t v, size_t l) const + { + return (_B.Rank(1, v + l, /*inclusive=*/1)) * _c ; + } + + // The label of the edge that leads to node v. 
+ size_t ChildLabel(size_t v) const + { + if (v == Root()) + return 0 ; + size_t tid = NodeMap(v) ; + size_t j = _B.Select(1, tid) ; // edge from parent to v + return j % _c ; + } + + void Save(FILE *fp) + { + Tree_Cardinal::Save(fp) ; + _B.Save(fp) ; + } + + void Load(FILE *fp) + { + Tree_Cardinal::Load(fp) ; + _B.Load(fp) ; + } + +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_Cardinal_Ordinal.hpp b/compactds/Tree_Cardinal_Ordinal.hpp new file mode 100644 index 0000000..f3e5158 --- /dev/null +++ b/compactds/Tree_Cardinal_Ordinal.hpp @@ -0,0 +1,167 @@ +#ifndef _MOURISL_COMPACTDS_TREE_CARDINAL_ORDINAL +#define _MOURISL_COMPACTDS_TREE_CARDINAL_ORDINAL + +// Cardinal tree, where we use ordinal compact tree to store the structure +// and another bit vector to represent the concatenating labels + +#include "Tree_Cardinal.hpp" +#include "Tree_Cardinal_Plain.hpp" +#include "Bitvector_Plain.hpp" + +#include "Tree_BP.hpp" +#include "Tree_DFUDS.hpp" + +namespace compactds { + +template +class Tree_Cardinal_Ordinal: public Tree_Cardinal +{ +private: + TreeClass _t ; + BvClass _B ; // concatenated labeling showing whether the children for this label exist +public: + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + // Use c bits per node. 
+ void Init(const struct _plainCardinalTreeNode *treeNodes, size_t n, size_t c, size_t *treeIdMap) + { + size_t i, j ; + + _n = n ; + _c = c ; + _t.InitFromCardinalTree(treeNodes, n, c, treeIdMap) ; + + + WORD *W = Utils::MallocByBits(c * n) ; + for (i = 0 ; i < n ; ++i) + { + size_t k = treeIdMap[i] ; + for (j = 0 ; j < c ; ++j) + { + if (treeNodes[i].children[j] != 0) + Utils::BitSet(W, _c * k + j) ; + } + } + _B.Init(W, c * n) ; + free(W) ; + + _space = _t.GetSpace(false) + _B.GetSpace() - sizeof(_B) ; + } + + // The index in B + size_t Root() const + { + return _t.Root() ; + } + + // @return: the t-th child (1-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + return _t.ChildSelect(v, t) ; + } + + size_t FirstChild(size_t v) const + { + return _t.FirstChild(v) ; + } + + size_t LastChild(size_t v) const + { + return _t.LastChild(v) ; + } + + size_t ChildrenCount(size_t v) const + { + return _t.ChildrenCount(v) ; + } + + // return: v is the ret-th (1-based) child of the parent. + size_t ChildRank(size_t v) const + { + return _t.ChildRank(v) ; + } + + // The silbing function assumes v has + // those siblings. + size_t NextSibling(size_t v) const + { + return _t.NextSibling(v) ; + } + + size_t PrevSibling(size_t v) const + { + return _t.PrevSibling(v) ; + } + + size_t Parent(size_t v) const + { + return _t.Parent(v) ; + } + + bool IsLeaf(size_t v) const + { + return _t.IsLeaf(v) ; + } + + size_t LCA(size_t u, size_t v) const + { + return _t.LCA(u, v) ; + } + + // Maps index in B (v) back up to the actual node id + size_t NodeMap(size_t v) const + { + return _t.NodeMap(v) ; + } + + //Map actual node id to index in B (v). + size_t NodeSelect(size_t i) const + { + return _t.NodeSelect(i) ; + } + + // Number of children with label l. 1: has such children. 0-don't + size_t ChildrenLabeled(size_t v, size_t l) const + { + return (_B.Access(NodeMap(v) * _c + l) == 1) ? 1 : 0 ; + } + + // The child with label l. 
+ // Assuming label l's child exist + // Notice the difference from Child() + size_t LabeledChild(size_t v, size_t l) const + { + size_t k = NodeMap(v) ; + size_t r = _B.Rank1(k * _c + l) - _B.Rank1(k * _c) ; + return ChildSelect(v, r + 1) ; + } + + // The label of the edge that leads to node v. + size_t ChildLabel(size_t v) const + { + size_t p = NodeMap(Parent(v)) ; + size_t r = ChildRank(v) ; + return _B.Select( _B.Rank1(p * _c, 0) + r) - p * _c ; + } + + void Save(FILE *fp) + { + Tree_Cardinal::Save(fp) ; + _t.Save(fp) ; + _B.Save(fp) ; + } + + void Load(FILE *fp) + { + Tree_Cardinal::Load(fp) ; + _t.Load(fp) ; + _B.Load(fp) ; + } + +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_Cardinal_Plain.hpp b/compactds/Tree_Cardinal_Plain.hpp new file mode 100644 index 0000000..fcdaafb --- /dev/null +++ b/compactds/Tree_Cardinal_Plain.hpp @@ -0,0 +1,255 @@ +// Cardinal tree, plain representation +// +#ifndef _MOURISL_COMPACTDS_TREE_CARDINAL_PLAIN +#define _MOURISL_COMPACTDS_TREE_CARDINAL_PLAIN + +#include "Tree_Cardinal.hpp" + +namespace compactds { +struct _plainCardinalTreeNode +{ + size_t k ; // It is the k-th(0-based) children of the parent + size_t parent ; + size_t *children ; + + _plainCardinalTreeNode() + { + children = NULL ; + } + + ~_plainCardinalTreeNode() {} ; + + _plainCardinalTreeNode(size_t p, size_t inK, size_t c) + { + parent = p ; + k = inK ; + children = (size_t *)calloc(c, sizeof(size_t)) ; + } + + void Free() + { + if (children) + { + free(children) ; + children = NULL ; + } + } + + void Save(FILE *fp, size_t c) + { + SAVE_VAR(fp, k) ; SAVE_VAR(fp, parent) ; // k must be persisted: ChildRank/NextSibling/PrevSibling read it after Load + SAVE_ARR(fp, children, c) ; + } + + void Load(FILE *fp, size_t c) + { + Free() ; + + LOAD_VAR(fp, k) ; LOAD_VAR(fp, parent) ; + children = (size_t *)calloc(c, sizeof(size_t)) ; + LOAD_ARR(fp, children, c) ; + } +} ; + +class Tree_Cardinal_Plain : public Tree_Cardinal +{ +private: + std::vector _nodes ; + size_t _c ; //child count +public: + Tree_Cardinal_Plain() {} ; + ~Tree_Cardinal_Plain() + { + Free() ; + 
} + + void Init(size_t childCount) + { + _c = childCount ; + struct _plainCardinalTreeNode node(0, 0, _c) ; + _nodes.push_back(node) ; + _n = 1 ; + } + + void Free() + { + size_t i ; + for (i = 0 ; i < _n ; ++i) + _nodes[i].Free() ; + _n = 0 ; + } + + size_t GetSpace(bool inclusive = true) + { + return _nodes.capacity() * sizeof(_nodes[0]) + (inclusive ? sizeof(*this) : 0) ; + } + + size_t AddNode(size_t parent, size_t k) + { + size_t id = _nodes.size() ; + struct _plainCardinalTreeNode node(parent, k, _c) ; + + _nodes[parent].children[k] = id ; + _nodes.push_back(node) ; + ++_n ; + + return id ; + } + + size_t Root() const + { + return 0 ; + } + + // t-th(1-based) child ; + size_t ChildSelect(size_t v, size_t t) const + { + size_t i ; + size_t cnt = 0 ; + for (i = 0 ; i < _c ; ++i) + { + if (_nodes[v].children[i] != 0) + { + if (cnt == t - 1) + return _nodes[v].children[i] ; + ++cnt ; + } + } + return 0 ; + } + + size_t FirstChild(size_t v) const + { + size_t i ; + for (i = 0 ; i < _c ; ++i) + if (_nodes[v].children[i] != 0) + return _nodes[v].children[i] ; + return 0 ; + } + + size_t LastChild(size_t v) const + { + size_t i ; + for (i = _c - 1 ; i < _c ; --i) + if (_nodes[v].children[i] != 0) + return _nodes[v].children[i] ; + return 0 ; + } + + size_t ChildrenCount(size_t v) const + { + size_t i ; + size_t c = FirstChild(v) ; + for (i = 0 ; c != 0 ; ++i) + c = NextSibling(c) ; + return i ; + } + + // return: v is the ret-th (1-based) child of the parent. 
+ size_t ChildRank(size_t v) const + { + if (v == Root()) + return 0 ; + size_t p = Parent(v) ; + size_t ret = 0 ; + size_t i ; + + // inclusive, for rank is always 1-based + for (i = 0 ; i <= _nodes[v].k ; ++i) + if (_nodes[p].children[i] != 0) + ++ret ; + return ret ; + } + + size_t NextSibling(size_t v) const + { + size_t i ; + size_t p = Parent(v) ; + for (i = _nodes[v].k + 1 ; i < _c ; ++i) + if (_nodes[p].children[i] != 0) + return _nodes[p].children[i] ; + return 0 ; + } + + size_t PrevSibling(size_t v) const + { + size_t i ; + size_t p = Parent(v) ; + for (i = _nodes[v].k - 1 ; i < _c ; --i) + if (_nodes[p].children[i] != 0) + return _nodes[p].children[i] ; + return 0 ; + } + + size_t Parent(size_t v) const + { + return _nodes[v].parent ; + } + + bool IsLeaf(size_t v) const + { + return (ChildrenCount(v) == 0) ; + } + + size_t NodeMap(size_t v) const + { + return v ; + } + + size_t NodeSelect(size_t i) const + { + return i ; + } + + // Number of children with label l. 1: has such children. 0-don't + size_t ChildrenLabeled(size_t v, size_t l) const + { + if (_nodes[v].children[l] != 0) + return 1 ; + else + return 0 ; + } + + // The children with label l. + size_t LabeledChild(size_t v, size_t l) const + { + return _nodes[v].children[l] ; + } + + // The label of the edge that leads to node v. 
+ size_t ChildLabel(size_t v) const + { + return _nodes[v].k ; + } + + const std::vector& GetTreeData() const + { + return _nodes ; + } + + void Save(FILE *fp) + { + Tree_Cardinal::Save(fp) ; + size_t i ; + for (i = 0 ; i < _n ; ++i) + _nodes[i].Save(fp, _c) ; + } + + void Load(FILE *fp) + { + std::vector< struct _plainCardinalTreeNode >().swap(_nodes) ; + + Tree_Cardinal::Load(fp) ; + size_t i ; + struct _plainCardinalTreeNode node ; + for (i = 0 ; i < _n ; ++i) + { + node.Load(fp, _c) ; + _nodes.push_back(node) ; + } + } + +} ; +} + +#endif diff --git a/compactds/Tree_DFUDS.hpp b/compactds/Tree_DFUDS.hpp new file mode 100644 index 0000000..11498a6 --- /dev/null +++ b/compactds/Tree_DFUDS.hpp @@ -0,0 +1,283 @@ +#ifndef _MOURISL_COMPACTDS_TREE_DFUDS +#define _MOURISL_COMPACTDS_TREE_DFUDS + +// Depth-First Unary Degree Sequence (Chapter 8.3) +// The implementation details might be different as we are using 0-index +// tree node id (i) as well. +// In the implementation, v is for the index on the encoded bitvector B. + +#include "Tree.hpp" +#include "Tree_Plain.hpp" +#include "Tree_Cardinal_Plain.hpp" +#include "Bitvector_Plain.hpp" + +namespace compactds { +class Tree_DFUDS: public Tree +{ +private: + Bitvector_Plain _B ; + DS_Parenthesis _bp ; // dangling structure, the parentehesis is almost balanced. 
+ + size_t _m ; //|_B|, just for coding simplicity + + void Build(const struct _plainTreeNode *treeNodes, size_t n, size_t tag, size_t *treeIdMap, size_t &visited, size_t &bi) + { + size_t c ; + c = treeNodes[tag].child ; + + treeIdMap[tag] = visited ; + ++visited ; + + for (c = treeNodes[tag].child ; c != 0 ; c = treeNodes[c].sibling) + { + _B.BitSet(bi) ; + ++bi ; + } + ++bi ; // set as 0 + + c = treeNodes[tag].child ; + + for (c = treeNodes[tag].child ; c != 0 ; c = treeNodes[c].sibling) + Build(treeNodes, n, c, treeIdMap, visited, bi) ; + } + + void BuildFromCardinalTree(const struct _plainCardinalTreeNode *treeNodes, size_t n, size_t childCnt, size_t tag, size_t *treeIdMap, size_t &visited, size_t &bi) + { + size_t i ; + treeIdMap[tag] = visited ; + ++visited ; + + for (i = 0 ; i < childCnt; ++i) + { + if (treeNodes[tag].children[i] == 0) + continue ; + _B.BitSet(bi) ; + ++bi ; + } + ++bi ; + + for (i = 0 ; i < childCnt; ++i) + { + if (treeNodes[tag].children[i] == 0) + continue ; + BuildFromCardinalTree(treeNodes, n, childCnt, treeNodes[tag].children[i], treeIdMap, visited, bi) ; + } + } +public: + Tree_DFUDS() {} + ~Tree_DFUDS() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + _B.Free() ; + _bp.Free() ; + _n = 0 ; + _m = 0 ; + } + } + + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? 
sizeof(*this) : 0) ; + } + + void Init(const struct _plainTreeNode *treeNodes, size_t n, size_t *treeIdMap) + { + _n = n ; + _B.Malloc(2 * _n - 1) ; + size_t bi = 0 ; + size_t visited = 0 ; + Build(treeNodes, n, 0, treeIdMap, visited, bi) ; + _m = bi ; + + _B.Init() ; + _bp.Init(_B.GetData(), _m, 0, 2) ; + + _space = _B.GetSpace() - sizeof(_B) + _bp.GetSpace(false) ; + } + + void InitFromCardinalTree(const struct _plainCardinalTreeNode *treeNodes, size_t n, + size_t childCount, size_t *treeIdMap) + { + _n = n ; + _B.Malloc(2 * _n - 1) ; + size_t bi = 0 ; + size_t visited = 0 ; + BuildFromCardinalTree(treeNodes, n, childCount, 0, treeIdMap, visited, bi) ; + _m = bi ; + + _B.Init() ; + _bp.Init(_B.GetData(), _m, 0, 2) ; + + _space = _B.GetSpace() - sizeof(_B) + _bp.GetSpace(false) ; + } + + // The index in B + size_t Root() const + { + return 0 ; + } + + // @return: the t-th child (1-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + size_t childCnt = ChildrenCount(v) ; + return _bp.Close( v + childCnt - t, _B.GetData(), _m) + 1 ; + } + + size_t FirstChild(size_t v) const + { + return _B.Succ0(v) + 1 ; + } + + size_t LastChild(size_t v) const + { + return _bp.Close(v, _B.GetData(), _m) + 1 ; + } + + size_t ChildrenCount(size_t v) const + { + return _B.Succ0(v) - v ; + } + + // return: v is the ret-th (1-based) child of the parent. + size_t ChildRank(size_t v) const + { + size_t open = _bp.Open(v - 1, _B.GetData(), _m) ; + return _B.Succ0(open) - open ; + } + + // The silbing function assumes v has + // those siblings. 
+ size_t NextSibling(size_t v) const + { + return _bp.GetRmmTree().FwdSearch(v, -1, _B.GetData(), _m) + 1 ; + } + + size_t PrevSibling(size_t v) const + { + // Notice the order here, the closer to the end of the (((), the close + // ) corresponds to the earlier children + return _bp.Close(_bp.Open(v - 1, _B.GetData(), _m) + 1, + _B.GetData(), _m) + 1 ; + } + + size_t Parent(size_t v) const + { + if (v == 0) + return 0 ; + + return _B.Pred0(_bp.Open(v - 1, _B.GetData(), _m)) + 1 ; + } + + // # of nodes in the substree, inclusive. + size_t SubTreeSize(size_t v) const + { + // For 2*m-1 parenthesis in the range: m ')', m-1 '(' + // The equation below is actually (2*m-2)/2 + 1 + return (_bp.GetRmmTree().FwdSearch(v, -1, _B.GetData(), _m) - v) / 2 + 1 ; + } + + // Whether u is an ancestor of v. + bool IsAncestor(size_t u, size_t v) const + { + size_t uend = _bp.GetRmmTree().FwdSearch(u, -1, _B.GetData(), _m) ; + if (v >= u && v <= uend) + return true ; + return false ; + } + + bool IsLeaf(size_t v) const + { + return (_B.Access(v) == 0) ; + } + + size_t LCA(size_t u, size_t v) const + { + if (v < u) + { + size_t tmp = v ; + v = u ; + u = tmp ; + } + + if (IsAncestor(u, v)) + return u ; + //printf("%s: %d %d. %d\n", __func__, u, v, + // _bp.GetRmmTree().Rmq(u, v - 1, _B.GetData(), _m)) ; + + // Think about this more. 
+ // Example (())): node with two leaves + // v-1 in Rmq then add 1 back handles both the leaf case and internal node case + return Parent( _bp.GetRmmTree().Rmq(u, v - 1, + _B.GetData(), _m) + 1) ; + } + + size_t LeafCountInSubTree(size_t v) const + { + if (IsLeaf(v)) + return 1 ; + else + { + size_t vend = _bp.GetRmmTree().FwdSearch(v, -1, _B.GetData(), _m) ; + return _bp.PatternRank(vend - 1, _B.GetData(), _m) - _bp.PatternRank(v, _B.GetData(), _m) ; + } + } + + // Rank and select with respect to leaves in _B order + // Assuming v is the leaf + size_t LeafRank(size_t v, int inclusive = 1) const + { + // The end of 00 corresponds to the leaf + return _bp.PatternRank(v - 1, _B.GetData(), _m, inclusive) ; + } + + size_t LeafSelect(size_t i) const + { + return _bp.PatternSelect(i, _B.GetData(), _m) + 1 ; + } + + // Maps index in B (v) back up to the actual node id + size_t NodeMap(size_t v) const + { + // Inclusive==0 because leaf node will have ')' on the index + return _B.Rank(0, v, /*inclusive=*/0) ; + } + + // Map actual node id to index in B (v). + size_t NodeSelect(size_t i) const + { + if (i == 0) + return 0 ; + return _B.Select(0, i) + 1 ; + } + + void Save(FILE *fp) + { + Tree::Save(fp) ; + + _B.Save(fp) ; + _bp.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + Tree::Load(fp) ; + + _B.Load(fp) ; + _bp.Load(fp) ; + + _m = 2 * _n - 1 ; + } +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_LOUDS.hpp b/compactds/Tree_LOUDS.hpp new file mode 100644 index 0000000..0450149 --- /dev/null +++ b/compactds/Tree_LOUDS.hpp @@ -0,0 +1,171 @@ +#ifndef _MOURISL_COMPACTDS_TREE_LOUDS +#define _MOURISL_COMPACTDS_TREE_LOUDS + +// Level-order Unary degree sequence (Chapter 8.1) +// The implementation details might be different as we are using 0-index +// tree node id (i) as well. +// In the implementation, v is for the index on the encoded bitvector B. 
+ +#include "Tree.hpp" +#include "Tree_Plain.hpp" +#include "Bitvector_Plain.hpp" + +namespace compactds { +class Tree_LOUDS: public Tree +{ +private: + Bitvector_Plain _B ; +public: + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + void Init(const struct _plainTreeNode *treeNodes, size_t n, size_t *treeIdMap) + { + size_t i ; + _B.Malloc(2 * n - 1) ; // n nodes, n-1 edges + _space = _B.GetSpace() - sizeof(_B); + + // BFS on tree nodes + // The algorithm 8.3 will change the original plain tree + size_t *queue = (size_t *)malloc(sizeof(size_t) * n) ; + size_t qhead, qtail ; + size_t m = 0 ; // position on _B + queue[0] = 0 ; + qhead = 0 ; + qtail = 1 ; + while (qhead < qtail) + { + size_t node = queue[qhead] ; + if (treeIdMap != NULL) + treeIdMap[node] = qhead ; + + ++qhead ; + + size_t childCnt = 0 ; + size_t c = treeNodes[node].child ; + for (childCnt = 0 ; c != 0 ; ++childCnt) + { + queue[qtail] = c ; + ++qtail ; + c = treeNodes[c].sibling ; + } + + for (i = m ; i < m + childCnt ; ++i) + _B.BitSet(i) ; + m += childCnt + 1 ; + } + free(queue) ; + + _B.Init() ; + } + + // The index in B + size_t Root() const + { + return 0 ; + } + + // @return: the t-th child (1-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + return _B.Select(0, _B.Rank(1, v + t - 1)) + 1 ; + } + + size_t FirstChild(size_t v) const + { + return ChildSelect(v, 1) ; + } + + size_t LastChild(size_t v) const + { + return ChildSelect(v, ChildrenCount(v)) ; + } + + size_t ChildrenCount(size_t v) const + { + return _B.Succ0(v) - v ; + } + + // return: v is the ret-th (1-based) child of the parent. 
+ size_t ChildRank(size_t v) const + { + if (v == Root()) + return 0 ; + //_B.Rank(0, v-1): the node id + //_B.Select(1, _B.Rank(0, v-1)): The edge connect the parent and v + size_t j = _B.Select(1, _B.Rank(0, v - 1)) ; + return j - _B.Pred0(j) ; // rank is always 1-based + } + + // The silbing function assumes v has + // those siblings. + size_t NextSibling(size_t v) const + { + return _B.Succ0(v) + 1 ; + } + + size_t PrevSibling(size_t v) const + { + return _B.Pred0(v - 2) + 1 ; + } + + size_t Parent(size_t v) const + { + if (v == Root()) + return 0 ; + size_t j = _B.Select(1, _B.Rank(0, v - 1)) ; + return _B.Pred0(j) + 1 ; + } + + bool IsLeaf(size_t v) const + { + return _B.Access(v) == 0 ; + } + + size_t LCA(size_t u, size_t v) const + { + while (u != v) + { + if (u > v) + u = Parent(u) ; + else + v = Parent(v) ; + } + return u ; + } + + // Maps index in B (v) back up to the actual node id + size_t NodeMap(size_t v) const + { + // Exclude current 0 in case v is the leaf. + return _B.Rank(0, v, /*inclusive=*/0) ; + } + + //Map actual node id to index in B (v). + size_t NodeSelect(size_t i) const + { + if (i == 0) + return Root() ; + else + return _B.Select(0, i) + 1 ; + } + + void Save(FILE *fp) + { + Tree::Save(fp) ; + _B.Save(fp) ; + } + + void Load(FILE *fp) + { + Tree::Load(fp) ; + _B.Load(fp) ; + } + +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_Labeled.hpp b/compactds/Tree_Labeled.hpp new file mode 100644 index 0000000..3f63a95 --- /dev/null +++ b/compactds/Tree_Labeled.hpp @@ -0,0 +1,256 @@ +#ifndef _MOURISL_COMPACTDS_TREE_LABELED +#define _MOURISL_COMPACTDS_TREE_LABELED + +// Labeled tree. The difference from the text book that we +// assume general representation of the tree structure, including BP. 
+// Therefore, for the concatenated children labels, we have an +// additional bit vector to indicate the start of each children label series, +// and we also have a place holder for the start position in the labels +// (this could be redundant, but having a explicit bit marker is more efficient) + +#include "Bitvector_Plain.hpp" +#include "Tree.hpp" + +#include "Sequence_WaveletTree.hpp" + +#include "Tree_LOUDS.hpp" +#include "Tree_BP.hpp" +#include "Tree_DFUDS.hpp" + +#include +#include + +namespace compactds { + +template , + class SequenceMarkerClass = Bitvector_Plain> +class Tree_Labeled: public Tree +{ +private: + TreeClass _t ; + SequenceClass _l ; + SequenceMarkerClass _lmarker ; // markers on l, indicating the start of the labels from a node + std::map _lmap ; // mapping label from size_to to ALPHABET + std::vector _lmapback ; // map labels from ALPAHBET back to original value +public: + size_t GetSpace(bool inclusive = true) + { + return _space + _lmap.size() * (sizeof(size_t) + sizeof(ALPHABET)) + _lmapback.capacity() + (inclusive ? sizeof(*this) : 0) ; + } + + // Use c bits per node. 
+ void Init(const struct _plainTreeNode *treeNodes, size_t n, + size_t *treeIdMap) + { + size_t i ; + _n = n ; + _t.Init(treeNodes, n, treeIdMap) ; + + for (i = 1 ; i < n ; ++i) + { + if (_lmap.find(treeNodes[i].label) == _lmap.end()) + { + ALPHABET id = _lmap.size() ; + _lmap[ treeNodes[i].label ] = id ; + _lmapback.push_back( treeNodes[i].label ) ; + } + } + + std::vector alphabetList ; + size_t lmapSize = _lmap.size() ; + for (i = 0 ; i < lmapSize ; ++i) + alphabetList.push_back((ALPHABET)i) ; + + ALPHABET placeHolder = lmapSize ; + alphabetList.push_back(placeHolder) ; + ++lmapSize ; + + FixedSizeElemArray childrenLabels ; + Alphabet labelAlphabet ; + int lmapBits = labelAlphabet.InitFromList(alphabetList.data(), lmapSize) ; + childrenLabels.Malloc(lmapBits, 2 * n - 1) ; + + WORD *W = Utils::MallocByBits(2 * n - 1) ; + size_t lused = 0 ; + size_t *nodeOrder = (size_t *)malloc(sizeof(*nodeOrder) * n) ; + for (i = 0 ; i < n ; ++i) + nodeOrder[ treeIdMap[i] ] = i ; + for (i = 0 ; i < n ; ++i) + { + size_t k = nodeOrder[i] ; + size_t c = treeNodes[k].child ; + + Utils::BitSet(W, lused) ; //mark the start of the child series + childrenLabels.Write(lused, placeHolder) ; + ++lused ; + while (c != 0) + { + childrenLabels.Write(lused, _lmap[treeNodes[c].label]) ; + ++lused ; + + c = treeNodes[c].sibling ; + } + } + free(nodeOrder) ; + + _l.SetAlphabet(labelAlphabet) ; + _l.Init(childrenLabels, 2 * n - 1, alphabetList.data()) ; + _lmarker.Init(W, 2 * n - 1) ; + + free(W) ; + + _space = _t.GetSpace(false) + _l.GetSpace() - sizeof(_l) + _lmarker.GetSpace() - sizeof(_lmarker) ; + } + + // The index in B + size_t Root() const + { + return _t.Root() ; + } + + // @return: the t-th child (1-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + return _t.ChildSelect(v, t) ; + } + + size_t FirstChild(size_t v) const + { + return _t.FirstChild(v) ; + } + + size_t LastChild(size_t v) const + { + return _t.LastChild(v) ; + } + + size_t 
ChildrenCount(size_t v) const + { + return _t.ChildrenCount(v) ; + } + + // return: v is the ret-th (1-based) child of the parent. + size_t ChildRank(size_t v) const + { + return _t.ChildRank(v) ; + } + + // The silbing function assumes v has + // those siblings. + size_t NextSibling(size_t v) const + { + return _t.NextSibling(v) ; + } + + size_t PrevSibling(size_t v) const + { + return _t.PrevSibling(v) ; + } + + size_t Parent(size_t v) const + { + return _t.Parent(v) ; + } + + bool IsLeaf(size_t v) const + { + return _t.IsLeaf(v) ; + } + + size_t LCA(size_t u, size_t v) const + { + return _t.LCA(u, v) ; + } + + // Maps index in B (v) back up to the actual node id + size_t NodeMap(size_t v) const + { + return _t.NodeMap(v) ; + } + + //Map actual node id to index in B (v). + size_t NodeSelect(size_t i) const + { + return _t.NodeSelect(i) ; + } + + // Number of children with label l. 1: has such children. 0-don't + size_t ChildrenLabeled(size_t v, size_t l) const + { + size_t i = NodeMap(v) ; + ALPHABET lmapped = _lmap.at(l) ; + + size_t start = _lmarker.Select(i + 1) ; + if (i == _n - 1) + { + return _l.Rank(lmapped, 2 * _n - 2) - _l.Rank(lmapped, start); + } + else + { + return _l.Rank(lmapped, _lmarker.Select(i + 2) - 1) - _l.Rank(lmapped, start) ; + } + } + + // The t-th child with label l. + size_t LabeledChildSelect(size_t v, size_t l, size_t t) const + { + size_t i = NodeMap(v) ; + ALPHABET lmapped = _lmap.at(l) ; + size_t start = _lmarker.Select(i + 1) ; + + size_t childRank = _l.Select(lmapped, _l.Rank(lmapped, start) + t) - start ; + return ChildSelect(v, childRank) ; + } + + // The label of the edge that leads to node v. 
+ size_t ChildLabel(size_t v) const + { + if (v == Root()) + return 0 ; + + size_t childRank = ChildRank(v) ; + size_t p = Parent(v) ; + return _lmapback.at( _l.Access( _lmarker.Select(NodeMap(p) + 1) + childRank) ) ; + } + + void Save(FILE *fp) + { + Tree::Save(fp) ; + _t.Save(fp) ; + _l.Save(fp) ; + _lmarker.Save(fp) ; + + size_t size = _lmapback.size() ; + SAVE_VAR(fp, size) ; + size_t i ; + for (i = 0 ; i < size ; ++i) + { + SAVE_VAR(fp, _lmapback[i]) ; + } + } + + void Load(FILE *fp) + { + Tree::Load(fp) ; + _t.Load(fp) ; + _l.Load(fp) ; + _lmarker.Load(fp) ; + + _lmap.clear() ; + _lmapback.clear() ; + size_t lmapSize ; + LOAD_VAR(fp, lmapSize) ; + size_t i ; + for (i = 0 ; i < lmapSize ; ++i) + { + size_t l ; + LOAD_VAR(fp, l) ; + _lmapback.push_back(l) ; + _lmap[l] = i ; + } + } +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_Plain.hpp b/compactds/Tree_Plain.hpp new file mode 100644 index 0000000..a2be8a1 --- /dev/null +++ b/compactds/Tree_Plain.hpp @@ -0,0 +1,277 @@ +#ifndef _MOURISL_COMPACTDS_TREE_PLAIN +#define _MOURISL_COMPACTDS_TREE_PLAIN + +#include + +#include "Tree.hpp" +#include "SimpleVector.hpp" + +namespace compactds { +struct _plainTreeNode +{ + size_t parent ; + size_t sibling ; + size_t child ; + size_t lastChild ; + + size_t label ; // the label from the parent to itself + + _plainTreeNode(size_t p, size_t s, size_t c, size_t lc) + { + parent = p ; + sibling = s ; + child = c ; + lastChild = lc ; + label = 0 ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, parent) ; + SAVE_VAR(fp, sibling) ; + SAVE_VAR(fp, child) ; + SAVE_VAR(fp, lastChild) ; SAVE_VAR(fp, label) ; // label is read by ChildrenLabeled/ChildLabel, so persist it + } + + void Load(FILE *fp) + { + LOAD_VAR(fp, parent) ; + LOAD_VAR(fp, sibling) ; + LOAD_VAR(fp, child) ; + LOAD_VAR(fp, lastChild) ; LOAD_VAR(fp, label) ; + } +} ; + +class Tree_Plain: public Tree +{ +private: + std::vector _nodes ; +public: + Tree_Plain() {} + ~Tree_Plain() {} + + void Init() + { + _n = 1 ; + + struct _plainTreeNode node(0, 0, 0, 0) ; + _nodes.push_back(node) ; + } + + size_t 
GetSpace(bool inclusive = true) + { + return _nodes.capacity() * sizeof(struct _plainTreeNode) + (inclusive ? sizeof(*this) : 0) ; + } + + // Assumes parent is already in the tree + //@return: tree index + size_t AddNode(size_t parent) + { + size_t id = _nodes.size() ; + struct _plainTreeNode node(parent, 0, 0, 0) ; + size_t lastSibling = LastChild(parent) ; + + if (lastSibling == 0) + _nodes[parent].child = id ; + else + _nodes[lastSibling].sibling = id ; + + _nodes[parent].lastChild = id ; + _nodes.push_back(node) ; + ++_n ; + + return id ; + } + + size_t Root() const + { + return 0 ; + } + + // t-th(1-based) child ; + size_t ChildSelect(size_t v, size_t t) const + { + --t ; + + size_t c = FirstChild(v) ; + size_t i ; + for (i = 0 ; i < t ; ++i) + c = NextSibling(c) ; + return c ; + } + + size_t FirstChild(size_t v) const + { + return _nodes[v].child ; + } + + size_t LastChild(size_t v) const + { + return _nodes[v].lastChild ; + } + + size_t ChildrenCount(size_t v) const + { + size_t i ; + size_t c = FirstChild(v) ; + for (i = 0 ; c != 0 ; ++i) + c = NextSibling(c) ; + return i ; + } + + // return: v is the ret-th (1-based) child of the parent. 
+ size_t ChildRank(size_t v) const + { + if (v == Root()) + return 0 ; + size_t c = FirstChild( Parent(v) ) ; + size_t i ; + for (i = 0 ; c != v ; ++i) + c = NextSibling(c) ; + return i + 1 ; // +1: rank is always 1-based + } + + size_t NextSibling(size_t v) const + { + return _nodes[v].sibling ; + } + + size_t PrevSibling(size_t v) const + { + size_t i ; + size_t c = FirstChild( Parent(v) ) ; + for (i = 0 ; v != NextSibling(c) ; ++i) + c = NextSibling(c) ; + return c ; + } + + size_t Parent(size_t v) const + { + return _nodes[v].parent ; + } + + bool IsLeaf(size_t v) const + { + if (_nodes[v].child == 0) + return true ; + return false ; + } + + size_t LCA(size_t u, size_t v) const + { + SimpleVector upath ; + SimpleVector vpath ; + + size_t p ; + + upath.PushBack(u) ; + p = Parent(u) ; + while (p != 0) + { + upath.PushBack(p) ; + p = Parent(p) ; + } + upath.PushBack(0) ; + + vpath.PushBack(v) ; + p = Parent(v) ; + while (p != 0) + { + vpath.PushBack(p) ; + p = Parent(p) ; + } + vpath.PushBack(0) ; + + upath.Reverse() ; + vpath.Reverse() ; + + size_t size = MIN(upath.Size(), vpath.Size()) ; + size_t i ; + for (i = 0 ; i < size; ++i) + if (upath[i] != vpath[i]) + break ; + return upath[i - 1] ; + } + + size_t NodeMap(size_t v) const + { + return v ; + } + + size_t NodeSelect(size_t i) const + { + return i ; + } + + const std::vector& GetTreeData() const + { + return _nodes ; + } + + void SetLabel(size_t v, size_t l) + { + _nodes[v].label = l ; + } + + // Number of children with label l. 1: has such children. 0-don't + size_t ChildrenLabeled(size_t v, size_t l) const + { + size_t ret = 0 ; + int c = _nodes[v].child ; + while (c != 0) + { + if (_nodes[c].label == l) + ++ret ; + c = _nodes[c].sibling ; + } + return ret ; + } + + // The child with label l. 
+ size_t LabeledChildSelect(size_t v, size_t l, size_t t) const + { + size_t cnt = 0 ; + size_t c = _nodes[v].child ; + while (c != 0) + { + if (_nodes[c].label == l) + ++cnt ; + if (cnt >= t) + break ; + c = _nodes[c].sibling ; + } + return c ; + } + + // The label of the edge that leads to node v. + size_t ChildLabel(size_t v) const + { + return _nodes[v].label ; + } + + void Save(FILE *fp) + { + Tree::Save(fp) ; + size_t i ; + for (i = 0 ; i < _n ; ++i) + _nodes[i].Save(fp) ; + } + + void Load(FILE *fp) + { + std::vector< struct _plainTreeNode >().swap(_nodes) ; + + Tree::Load(fp) ; + size_t i ; + struct _plainTreeNode node(0, 0, 0, 0) ; + for (i = 0 ; i < _n ; ++i) + { + node.Load(fp) ; + _nodes.push_back(node) ; + } + } +} ; + +} + +#endif diff --git a/compactds/UniversalHashGenerator.hpp b/compactds/UniversalHashGenerator.hpp new file mode 100644 index 0000000..636be0b --- /dev/null +++ b/compactds/UniversalHashGenerator.hpp @@ -0,0 +1,81 @@ +#ifndef _MOURISL_COMPACTDS_UNIVERSALHASHGENERATOR +#define _MOURISL_COMPACTDS_UNIVERSALHASHGENERATOR + +#include "Utils.hpp" + +// Universal hash family of ((a*x+b)%p)%m + +namespace compactds { +class UniversalHashGenerator +{ +private: + const uint64_t p ; // The largest prime in 63bit, so 2*p can be in 64bit + uint32_t state ; + uint64_t m ; + + // Lehmer random generator + // https://en.wikipedia.org/wiki/Lehmer_random_number_generator + uint32_t Random() + { + return state = (state * 279470273ull) % 0xfffffffb; + } + + // Not really 64 bit due to 0xfffffffb, but close enough + uint64_t Random64() + { + uint32_t lower32 = Random() ; + uint32_t upper32 = Random() ; + return (upper32 * 0xfffffffbull) + lower32 ; + } +public: + UniversalHashGenerator():p(9223372036854775783ull) {} + ~UniversalHashGenerator() {} + + size_t GetSpace() {return sizeof(*this);} + + // map [0..n] to the range of [0,..,m-1] + // @return: the big prime p. 
0 if failed + uint64_t Init(uint64_t m, uint32_t seed) + { + if (seed == 0) + seed = 17 ; + state = seed ; + this->m = m ; + + return p ; + } + + size_t GetP() + { + return p; + } + + void SetSeed(uint32_t seed) + { + state = seed ; + } + + // Generate a pair of (a, b) + void Generate(uint64_t &a, uint64_t &b) + { + a = Random64() ; + if (a == 0) + { + state = 17 ; + a = Random64() ; + } + b = Random64() ; + } + + // Though the outside program should have enough information + // to do the mapping on its own, we provide the function here + // for convenience. + // The function should handle the overflow of a*x + uint64_t Map(uint64_t a, uint64_t b, uint64_t x) + { + return (Utils::SafeMultiMod(x, a, p) + b)%p % m ; + } +} ; +} + +#endif diff --git a/compactds/Utils.hpp b/compactds/Utils.hpp new file mode 100644 index 0000000..e4091c9 --- /dev/null +++ b/compactds/Utils.hpp @@ -0,0 +1,292 @@ +#ifndef _MOURISL_COMPACTDS_UTILS +#define _MOURISL_COMPACTDS_UTILS + +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __SSE4_2__ +#include +#endif + +namespace compactds { +#define WORD_64 // comment this out if word size is 32 + +#ifdef WORD_64 + typedef uint64_t WORD ; + #define WORDBITS 64 + #define WORDBYTES 8 + #define WORDBITS_WIDTH 6 +#else + typedef uint32_t WORD ; + #define WORDBITS 32 + #define WORDBYTES 4 + #define WORDBITS_WIDTH 6 + #define WORDBITS_WIDTH 5 +#endif + +#define DIV_CEIL(x,y) (((x)%(y))?((x)/(y)+1):((x)/(y))) +#define CEIL(x) (((int)(x) == (x))?((int)(x)):((int)(x) + 1)) +#define MIN(x,y) ((x)<=(y)?(x):(y)) +#define MAX(x,y) ((x)<=(y)?(y):(x)) + +// Create a mask of l 1s +#define MASK_WCHECK(l) (((l)>=(int)WORDBITS)?(0xffffffffffffffff):((1ull<<(l))-1ull)) +#define MASK(l) ((1ull<<((uint64_t)(l)))-1ull) + +// positive infinity +#define POSITIVE_INF ((uint64_t)-1) + +// x-y modules by k-bit block , which are wide words has k bits subblocks +// h masks/controls the block size +// Sebastiano Vigna, Broadword 
implementation of rank/select queries, 2008 +#define BITBLOCK_MODDIFF(x,y,h) (((x)|(h)) - ((y)&(~(h)))^(((x)^(~(y))&(h)))) +// Test x0 in a subblock fashion +#define BITBLOCK_GZERO(x, l, h) (((((x)|(h))-(l)) | (x)) & (h)) + +#define SAVE_VAR(fp, x) (fwrite(&(x), sizeof(x), 1, (fp))) +#define LOAD_VAR(fp, x) (fread(&(x), sizeof(x), 1, (fp))) + +#define SAVE_ARR(fp, x, n) (fwrite((x), sizeof(*(x)), (n), (fp))) +#define LOAD_ARR(fp, x, n) (fread((x), sizeof(*(x)), (n), (fp))) + +class Utils +{ +public: + // How many bits in the input x + static int CountBits(WORD x) + { + int ret = 0 ; + for (; x ; x >>= 1) + ++ret ; + return ret ; + } + + // Count the number of 1's in x. + static int Popcount(WORD x) + { +#ifdef __SSE4_2__ + return __builtin_popcountll(x); +#else +#ifdef WORD_64 + x = x - ((x >> 1) & 0x5555555555555555ull) ; + x = (x&0x3333333333333333ull) + ((x>>2)&0x3333333333333333ull) ; + return (((x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full) * 0x0101010101010101ull) >> 56 ; +#else + x = x - ((x >> 1) & 0x55555555) ; + x = (x&0x33333333) + ((x>>2)&0x33333333) ; + return (((x + (x >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24 ; +#endif +#endif + /*else + { + int ret = 0 ; + for (; x ; x &= (x-1)) + ++ret ; + return ret ; + }*/ + } + + // Select the r-th (1-index) 1 in word x + static int SelectInWord(WORD x, int r) + { + const uint64_t l8 = 0x0101010101010101ull ; + const uint64_t h8 = l8 << 7ull ; + --r ; + + uint64_t s, b, l ; + // Calculate the byte-wise partial sums + s = x - ((x & 0xAAAAAAAAAAAAAAAAull) >> 1) ; + s = (s & 0x3333333333333333ull) + ((s>>2) & 0x3333333333333333ull) ; + s = ((s + (s>>4))&0x0f0f0f0f0f0f0f0full) * l8 ; + // Locate the byte + // >> 53 is kind of make the byte unit to bit unit (>>56 << 3), makes later shift easier. 
+ b = (((BITBLOCK_LEQ(s, r * l8, h8)>>7) * l8) >> 53) & (~7ull) ; + l = r - (((s<<8) >> b) & 0xff) ; // update remainder + // Seems the 0x804..01ull trick is to expand the bit information into each byte + // each bit in a byte will be in its own byte of a 64bit integer + s = (BITBLOCK_GZERO(((x >> b & 0xff) * l8 & 0x8040201008040201ull), l8, h8) >> 7) * l8 ; + return b + (((BITBLOCK_LEQ(s, l * l8, h8) >> 7) * l8) >> 56) ; + + } + + // Compute ceil(log2(x)) without float computation + static int Log2Ceil(WORD x) + { + int bcnt = CountBits(x) ; + if (x == (1ull<<(bcnt - 1))) + return bcnt - 1 ; + else + return bcnt ; + } + + // The power function in the integer space + static uint64_t PowerInt(int x, int y) + { + uint64_t ret = 1 ; + uint64_t powerx = x ; + while (y) + { + if (y & 1) + ret *= powerx ; + powerx *= powerx ; + y >>= 1 ; + } + return ret ; + } + + // The multiplication then take mode: (a*b)%m + // that make sure a*b not overflow + static uint64_t SafeMultiMod(uint64_t a, uint64_t b, uint64_t m) + { + uint64_t ret = 0 ; + a %= m ; + while (b) + { + if (b & 1) + ret += a%m ; + a = (a * 2)%m ; + b >>= 1 ; + } + return ret ; + } + + // Assuming only span two words at most. + // s is j', e is j + // Get B[s..e] + static WORD BitsRead(const WORD *W, const size_t s, const size_t e) + { + // In practice we should let other part be correct about this + //if (s > e) + // return 0 ; + + const size_t ie = e >> WORDBITS_WIDTH ; // index for e + const size_t is = s >> WORDBITS_WIDTH ; + + const int rs = s & (WORDBITS - 1) ; + + if (ie == is) + { + // in the same block + return (W[ie] >> rs) & MASK_WCHECK(e-s+1) ; + } + else + { + const int re = e & (WORDBITS - 1) ;// e%w, the residual offset within a word + // Since ie!=is, re must be less than 63, so we don't need to check the MASK. + return (W[is] >> rs) | ((W[ie] & MASK(re + 1)) << (WORDBITS - rs)) ; + } + } + + // Write B[s..e]=x. 
+ static void BitsWrite(WORD *W, size_t s, size_t e, WORD x) + { + if (s > e) + return ; + const int w = sizeof(WORD) * 8 ; + int re = e & (w - 1) ;// e%w, the residual offset within a word + int rs = s & (w - 1) ; + + size_t ie = e/w ; // index for e + size_t is = s/w ; + + if (ie == is) + { + W[ie] = (W[ie] & ~(MASK_WCHECK(e-s+1) << rs)) | ((WORD)x<> (w-rs)) ; + } + } + + static int BitRead(const WORD *W, size_t i) + { + return (W[i>>WORDBITS_WIDTH] >> (i&(WORDBITS-1)))&1ull ; + } + + static void BitSet(WORD *W, size_t i) + { + W[i>>WORDBITS_WIDTH] |= (1ull << (i&(WORDBITS-1))) ; + } + + static void BitFlip(WORD *W, size_t i) + { + W[i>>WORDBITS_WIDTH] ^= (1ull << (i&(WORDBITS-1))) ; + } + + static void BitClear(WORD *W, size_t i) + { + if (BitRead(W, i)) + W[i>>WORDBITS_WIDTH] -= (1ull << (i&(WORDBITS-1))) ; + } + + static size_t BitsToWordBytes(size_t l) + { + return sizeof(WORD) * DIV_CEIL(l, sizeof(WORD)*8) ; + } + + static size_t BitsToWords(size_t l) + { + return DIV_CEIL(l, sizeof(WORD)*8) ; + } + + static WORD *MallocByBits(size_t l) + { + return (WORD *)calloc(BitsToWords(l), sizeof(WORD)) ; + } + + // Translate the space usage description (TB, GB, MB, KB) to bytes + static size_t SpaceStringToBytes(const char *s) + { + int i ; + size_t ret = 0 ; + for (i = 0 ; s[i] >= '0' && s[i] <= '9' ; ++i) + ret = ret * 10 + s[i] - '0' ; + + switch (s[i]) + { + case 'T': + case 't': + ret *= 1000000000000 ; break ; + case 'G': + case 'g': + ret *= 1000000000 ; break ; + case 'M': + case 'm': + ret *= 1000000 ; break ; + case 'K': + case 'k': + ret *= 1000 ; break ; + } + + return ret ; + } + + static void PrintLog( const char *fmt, ... 
) + { + va_list args ; + va_start( args, fmt ) ; + char buffer[500] ; + vsprintf( buffer, fmt, args ) ; + + time_t mytime = time(NULL) ; + struct tm *localT = localtime( &mytime ) ; + char stime[500] ; + strftime( stime, sizeof( stime ), "%c", localT ) ; + fprintf( stderr, "[%s] %s\n", stime, buffer ) ; + } +} ; +} +#endif diff --git a/compactds/VariableSizeElemArray.hpp b/compactds/VariableSizeElemArray.hpp new file mode 100644 index 0000000..b67d0f9 --- /dev/null +++ b/compactds/VariableSizeElemArray.hpp @@ -0,0 +1,33 @@ +#ifndef _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY +#define _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY + +#include + +#include "Utils.hpp" +#include "FixedSizeElemArray.hpp" + +/* + * The class for the array where each element has variable size + */ +namespace compactds { +class VariableSizeElemArray +{ +public: + VariableSizeElemArray() {} + + ~VariableSizeElemArray() {} + + virtual void Free() = 0; + + // Create the variable size element array + // b - block size + // in - input array + // n - the length of input array + // + virtual void InitFromArray(int b, const unsigned int *in, const size_t &n) = 0 ; + + virtual unsigned int Read(size_t i) = 0 ; +} ; +} + +#endif diff --git a/compactds/VariableSizeElemArray_DensePointers.hpp b/compactds/VariableSizeElemArray_DensePointers.hpp new file mode 100644 index 0000000..0984534 --- /dev/null +++ b/compactds/VariableSizeElemArray_DensePointers.hpp @@ -0,0 +1,144 @@ +#ifndef _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_DENSEPOINTERS +#define _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_DENSEPOINTERS + +#include + +#include "Utils.hpp" +#include "FixedSizeElemArray.hpp" + +#include "VariableSizeElemArray.hpp" + +/* + * The class for the array where each element has variable size + * Implement with dense pointers for constant time access (Section 3.2.2) + */ +namespace compactds { +class VariableSizeElemArray_DensePointers: public VariableSizeElemArray +{ +private: + WORD *M ; // the compressed data + size_t 
*P ; // sampled pointer + FixedSizeElemArray offsets ; // the offset within each block + int b ; + size_t n ; + int lastPosInM ; // the last position used in M + int space ; +public: + VariableSizeElemArray_DensePointers() + { + M = NULL ; + P = NULL ; + space = 0 ; + } + + ~VariableSizeElemArray_DensePointers() + { + Free() ; + } + + void Free() + { + if (M != NULL) + free(M) ; + if (P != NULL) + free(P) ; + offsets.Free() ; + M = NULL ; + P = NULL ; + } + + // Create the variable size element array + // b - block size. has to be > 1 + // in - input array (non-negative) + // n - the length of input array + void InitFromArray(int blockSize, const unsigned int *in, const size_t &n) + { + size_t i, j ; + int maxL = 0 ; + size_t totalL = 0 ; + this->n = n ; + for (i = 0 ; i < n ; ++i) + { + int bcnt = Utils::CountBits(in[i] + 1) ; // need to shift 1 to allow 0. + if (bcnt > maxL) + maxL = bcnt ; + totalL += bcnt ; + } + + b = blockSize ; + + if (b <= 1) + { + b = CEIL(sizeof(WORD) * 8 * log(2)) ; // TODO: automatic block size determination + } + + size_t blockCnt = DIV_CEIL(n, b) ; + P = (size_t *)malloc(blockCnt * sizeof(size_t)) ; + offsets.Malloc( Utils::Log2Ceil( (b - 1) * (double)(maxL-1) ), n - blockCnt) ; + space = blockCnt * sizeof(size_t) + offsets.GetSpace() - sizeof(offsets); + + M = Utils::MallocByBits(totalL - n) ; // We don't store the highest bit so -n + space += Utils::BitsToWordBytes(totalL - n) ; + + // Encode the data + size_t sumL = 0 ; + size_t withinOffset = 0 ; + // i for indexing the input array, j for indexing the offsets array + for (i = 0, j = 0 ; i < n ; ++i) + { + int bcnt = Utils::CountBits(in[i] + 1) ; + if (i % b == 0) + { + P[i/b] = sumL ; + withinOffset = 0 ; + } + else // Only store the offet of the first element + { + offsets.Write(j, withinOffset) ; + ++j ; + } + if (in[i] == 0) + continue ; + + Utils::BitsWrite(M, sumL, sumL + bcnt - 1 - 1, (in[i] + 1) & MASK(bcnt - 1)) ; + //if (j > 0) printf("i=%d j=%d: in[i]=%d bcnt-1=%d. 
sumL=%d withinOffset=%d. offsets[j]=%d |elem|=%d\n", i, j, in[i], bcnt - 1, sumL, withinOffset, offsets.Read(j - 1), offsets.GetElemLength()); + sumL += bcnt - 1 ; + withinOffset += bcnt - 1 ; + } + lastPosInM = sumL - 1 ; + } + + unsigned int Read(size_t i) + { + int pi = i / b ; // index in P + int presidual = i % b ; + int nextpi = (i + 1) / b ; + size_t ms, me ; // start and end in M. + + ms = P[pi] ; + if (presidual > 0) + ms = P[pi] + offsets.Read(i - pi - 1) ; // -pi because each block skip one elemtn in offsets + if (i + 1 < n) + { + if (pi == nextpi) + me = P[pi] + offsets.Read(i + 1 - pi - 1) - 1; + else + me = P[nextpi] - 1 ; + } + else + me = lastPosInM ; + + //printf("\ni=%d: ms=%d me=%d. pi=%d offset=%d\n", i, ms, me, pi, offsets.Read(i - pi)) ; + if (ms > me || (i == 0 && me == (size_t)-1)) + return 0 ; + return (Utils::BitsRead(M, ms, me) | (1<<(me - ms + 1))) - 1; + } + + int GetSpace() + { + return space + sizeof(*this) ; + } +} ; +} + +#endif diff --git a/compactds/VariableSizeElemArray_DirectAccess.hpp b/compactds/VariableSizeElemArray_DirectAccess.hpp new file mode 100644 index 0000000..b2dcca6 --- /dev/null +++ b/compactds/VariableSizeElemArray_DirectAccess.hpp @@ -0,0 +1,76 @@ +#ifndef _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_DIRECTACCESS +#define _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_DIRECTACCESS + +#include + +#include "Utils.hpp" +#include "Encode.hpp" +#include "FixedSizeElemArray.hpp" +#include "VariableSizeElemArray.hpp" + + +/* + * The class for the array where each element has variable size + */ +namespace compactds { +class VariableSizeElemArray_DirectAccess : public VariableSizeElemArray +{ +private: + WORD **M ; // mark whether this is the last piece + WORD **P ; // the piece of block size b + + int b ; // block size + int levelCnt ; // the number of dimensions for M and P +public: + VariableSizeElemArray() + { + } + + ~VariableSizeElemArray() + { + Free() ; + } + + void Free() + { + } + + // Create the variable size element 
array + // b - block size + // in - input array + // n - the length of input array + // + void InitFromArray(int blockSize, const unsigned int *in, const size_t &n) + { + int totalL = 0 ; // total bit length + int maxL = 0 ; + int i ; + for (i = 0 ; i < n ; ++i) + { + int bcnt = Utils::CountBits(in[i]) ; + totalL += bcnt ; + if (bcnt > maxL) + maxL = bcnt ; + } + + if (b <= 0) + b = log(n) / log(2) ; //TODO: check + levelCount = DIV_CEIL(maxL, b) ; + + M = (WORD *)malloc(sizeof(WORD *) * levelCount) ; + P = (WORD *)malloc(sizeof(WORD *) * levelCount) ; + + for (i = 0 ; i < n ; ++i) + { + + } + } + + unsigned int Read(int i) + { + return 0 ; + } +} ; +} + +#endif diff --git a/compactds/VariableSizeElemArray_SampledPointers.hpp b/compactds/VariableSizeElemArray_SampledPointers.hpp new file mode 100644 index 0000000..5218abd --- /dev/null +++ b/compactds/VariableSizeElemArray_SampledPointers.hpp @@ -0,0 +1,118 @@ +#ifndef _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_SAMPLEDPOINTERS +#define _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_SAMPLEDPOINTERS + +#include + +#include "Utils.hpp" +#include "EliasCode.hpp" +#include "FixedSizeElemArray.hpp" + +#include "VariableSizeElemArray.hpp" + +/* + * The class for the array where each element has variable size + * Implement with sampled pointers (Section 3.2.1) + */ +namespace compactds { +class VariableSizeElemArray_SampledPointers: public VariableSizeElemArray +{ +private: + WORD *M ; // the compressed data + size_t *P ; // sampled pointer + int b ; + + int space ; +public: + VariableSizeElemArray_SampledPointers() + { + M = NULL ; + P = NULL ; + space = 0 ; + } + + ~VariableSizeElemArray_SampledPointers() + { + Free() ; + } + + void Free() + { + if (M != NULL) + free(M) ; + if (P != NULL) + free(P) ; + M = NULL ; + P = NULL ; + } + + // Create the variable size element array + // b - block size. 
has to be > 1 + // in - input array + // n - the length of input array + void InitFromArray(int blockSize, const unsigned int *in, const size_t &n) + { + size_t i ; + size_t totalEncodeBits = 0 ; + for (i = 0 ; i < n ; ++i) + { + int bcnt = Utils::CountBits(in[i] + 1) ; // need to shift 1 to allow 0. + totalEncodeBits += 2 * bcnt - 1 ; // we use gamma encoding because the number of bits for each number is less than 32 in general, which makes it more efficient than delta encoding + } + + b = blockSize ; + + if (b <= 1) + { + b = sizeof(WORD) * 8 ; // extra overhead 1 bit per element + } + + size_t blockCnt = DIV_CEIL(n, b) ; + P = (size_t *)malloc(blockCnt * sizeof(size_t)) ; + space = blockCnt * sizeof(size_t) ; + + M = Utils::MallocByBits(totalEncodeBits) ; // We don't store the highest bit so -n + space += Utils::BitsToWordBytes(totalEncodeBits) ; + + // Encode the data + size_t sumL = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (i % b == 0) + { + P[i/b] = sumL ; + } + + int l ; + WORD x = EliasCode::Gamma(in[i] + 1, l) ; + Utils::BitsWrite(M, sumL, sumL + l - 1, x) ; + + //int tmpl ; + //printf("i=%d: in[i]=%d encode=%lld l=%d. sumL=%d. 
decode=%d\n", i, in[i], x, l, sumL, + // EliasCode::ReadOneGamma(M, sumL, tmpl)); + sumL += l ; + } + } + + unsigned int Read(size_t i) + { + size_t pi = i / b ; // index in P + size_t j = pi * b ; + size_t offset = P[pi] ; + int ret = 1 ; + int l ; + for (; j <= i ; ++j) + { + ret = EliasCode::ReadOneGamma(M, offset, l) ; + offset += l ; + } + + return ret - 1; + } + + int GetSpace() + { + return space + sizeof(*this) ; + } +} ; +} +#endif diff --git a/compactds/bitvector_benchmark.cpp b/compactds/bitvector_benchmark.cpp new file mode 100644 index 0000000..7b42638 --- /dev/null +++ b/compactds/bitvector_benchmark.cpp @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include +#include "Bitvector_Plain.hpp" +#include "Bitvector_Sparse.hpp" +#include "DS_Select_Test.hpp" + +using namespace std ; +using namespace std::chrono ; +using timer = std::chrono::high_resolution_clock; +using namespace compactds ; + +const int n = 800000000 ; +const int reps = 10000000 ; + +void set_random_bits(std::vector &v, int seed) +{ + std::mt19937_64 rng; + if (0 == seed) { + rng.seed(std::chrono::system_clock::now().time_since_epoch().count()); + } else + rng.seed(seed); + + size_t *data = v.data() ; + size_t size = v.size() ; + *data = rng(); + for (size_t i=1; i < size; ++i) { + *(++data) = rng(); + } +} + +std::vector rnd_positions(uint8_t log_s, uint64_t& mask, uint64_t mod=0, uint64_t seed=17) +{ + mask = (1< rands(1< 0) { + size_t i ; + size_t size = rands.size() ; + for (i = 0 ; i < size ; ++i) + rands[i] %= mod ; + } + return rands; +} + + +int main(int argc, char *argv[]) +{ + size_t i ; + auto start = timer::now(); + Bitvector_Plain bv ; + WORD *b = Utils::MallocByBits(n) ; + + std::mt19937_64 rng; + std::uniform_int_distribution distribution(0, n-1); + auto dice = bind(distribution, rng); + + // populate vectors with some other bits + for (i=0; i < n/25; ++i) { + uint64_t x = dice(); + Utils::BitSet(b, x) ; + } + auto stop = timer::now(); + cout << "initialization 
in (ms): " << duration_cast(stop-start).count() << endl; + + cout << "size in byptes: " << Utils::BitsToWordBytes(n) << endl; + + start = timer::now(); + bv.SetSelectTypeSupport(3) ; + if (argc == 1) + bv.SetSelectSpeed(3) ; + else + bv.SetSelectSpeed(atoi(argv[1])) ; + DS_Rank9 ranktst ; + bv.Init(b, n) ; + ranktst.Init(b, n) ; + DS_Select_Test selectTst ; + selectTst.Init(0, b, n, 2, 3) ; + stop = timer::now() ; + cout << "construction in (ms): " << duration_cast(stop-start).count() << endl; + cout << "size in bytes: " << bv.GetSpace() << endl; + + auto ones = bv.Rank(1, n) ; + auto zeros = n-ones; + if (0) + { + uint64_t mask = 0; + + auto rands = rnd_positions(20, mask, zeros, 17); + for (uint64_t i=0; i +#include +#include +#include + +#include +#include +#include +#include + +#include "SequenceCompactor.hpp" +#include "Sequence_WaveletTree.hpp" +#include "Sequence_RunLength.hpp" +#include "Sequence_Hybrid.hpp" +#include "Sequence_RunBlock.hpp" +#include "FMBuilder.hpp" +#include "FMIndex.hpp" + +// Usage: ./a.out (sequence_file|bwt) [load] +using namespace std::chrono ; +using timer = std::chrono::high_resolution_clock; + +using namespace compactds ; + +int main(int argc, char *argv[]) +{ + std::string seq ; + FixedSizeElemArray s ; + + char abList[] = "ACGT" ; + FixedSizeElemArray BWT ; + size_t n = 0 ; + const size_t maxTestCnt = 10000000 ; + + if (atoi(argv[2]) == 0 || argc <= 2) + { + std::ifstream ifs(argv[1], std::ifstream::in) ; + std::getline(ifs, seq) ; + SequenceCompactor seqCompactor ; + seqCompactor.Init(abList, s, 1000000) ; + seqCompactor.Compact(seq.c_str(), s) ; + + n = s.GetSize() ; + struct _FMBuilderParam param ; + struct _FMIndexAuxData fmAuxData ; + param.threadCnt = 4 ; + param.saBlockSize = n / param.threadCnt ; + + FMBuilder::InferParametersGivenMemory(n, strlen(abList), Utils::SpaceStringToBytes("24G"), param) ; + size_t firstISA = 0 ; + FMBuilder::Build(s, n, strlen(abList), + BWT, firstISA, param) ; + param.Free() ; + FILE *fp = 
fopen("tmp.idx", "w") ; + BWT.Save(fp) ; + fclose(fp) ; + } + else + { + FILE *fp = fopen(argv[1], "r") ; + BWT.Load(fp) ; + fclose(fp) ; + + n = BWT.GetSize() ; + } + printf("Total size: %lu\n", n) ; + + { + Sequence_WaveletTree<> plbwt ; // plain bwt + plbwt.SetSelectSpeed(0) ; + plbwt.Init(BWT, n, abList) ; + printf("Plain bwt space (bytes): %lu\n", plbwt.GetSpace()) ; + + auto start = timer::now(); + size_t check = 0 ; + size_t i ; + for (i = 0 ; i < n && i < maxTestCnt ; ++i) + { + size_t x = plbwt.Rank('A', i) ; + check += x ; + } + auto stop = timer::now(); + std::cout << "# rank time (ns) from " << i << " = " << duration_cast(stop-start).count()/(double)i << std::endl; + std::cout << "# rank sum = " << check << std::endl; + } + + { + Sequence_RunLength rlbwt ; + rlbwt.Init(BWT, n, abList) ; + rlbwt.PrintStats() ; + printf("Runlength bwt space (bytes): %lu\n", rlbwt.GetSpace()) ; + + auto start = timer::now(); + size_t check = 0 ; + size_t i ; + for (i = 0 ; i < n && i < maxTestCnt ; ++i) + { + size_t x = rlbwt.Rank('A', i) ; + check += x ; + } + auto stop = timer::now(); + std::cout << "# rank time (ns) from " << i << " = " << duration_cast(stop-start).count()/(double)i << std::endl; + std::cout << "# rank sum = " << check << std::endl; + } + + if (1) + { + Sequence_Hybrid hybbwt ; + //hybbwt.SetBlockSize(8) ; + hybbwt.Init(BWT, n, abList) ; + hybbwt.PrintStats() ; + printf("Hybrid bwt space (bytes): %lu\n", hybbwt.GetSpace()) ; + + auto start = timer::now(); + size_t check = 0 ; + size_t i ; + for (i = 0 ; i < n && i < maxTestCnt ; ++i) + { + size_t x = hybbwt.Rank('A', i) ; + check += x ; + } + auto stop = timer::now(); + std::cout << "# rank time (ns) from " << i << " = " << duration_cast(stop-start).count()/(double)i << std::endl; + std::cout << "# rank sum = " << check << std::endl; + } + + { + Sequence_RunBlock rbbwt ; + //rbbwt.SetBlockSize(5) ; + rbbwt.Init(BWT, n, abList) ; + rbbwt.PrintStats() ; + printf("RunBlock bwt space (bytes): %lu\n", 
rbbwt.GetSpace()) ; + + auto start = timer::now(); + size_t check = 0 ; + size_t i ; + for (i = 0 ; i < n && i < maxTestCnt ; ++i) + { + size_t x = rbbwt.Rank('A', i) ; + check += x ; + } + auto stop = timer::now(); + std::cout << "# rank time (ns) from " << i << " = " << duration_cast(stop-start).count()/(double)i << std::endl; + std::cout << "# rank sum = " << check << std::endl; + } + + return 0 ; +} diff --git a/compactds/test.cpp b/compactds/test.cpp new file mode 100644 index 0000000..cfa29ab --- /dev/null +++ b/compactds/test.cpp @@ -0,0 +1,1881 @@ +#include +#include +#include +#include + +#include "FixedSizeElemArray.hpp" +#include "FractionBitElemArray.hpp" +#include "VariableSizeElemArray_SampledPointers.hpp" +#include "VariableSizeElemArray_DensePointers.hpp" +#include "InterleavedFixedSizeElemArray.hpp" + +#include "Bitvector_Plain.hpp" +#include "Bitvector_Compressed.hpp" +#include "Bitvector_Sparse.hpp" +#include "Bitvector_RunLength.hpp" + +#include "Sequence_Plain.hpp" +#include "Sequence_WaveletTree.hpp" +#include "Sequence_RunLength.hpp" +#include "Sequence_Hybrid.hpp" +#include "Sequence_RunBlock.hpp" + +#include "PerfectHash.hpp" +#include "PartialSum.hpp" + +#include "SuffixArrayGenerator.hpp" +#include "FMBuilder.hpp" +#include "FMIndex.hpp" + +#include "DS_InvPermutation.hpp" +#include "Permutation.hpp" +#include "InvertedIndex.hpp" + +#include "DS_Parenthesis.hpp" +#include "DS_PatternRankSelect.hpp" + +#include "Tree_Plain.hpp" +#include "Tree_LOUDS.hpp" +#include "Tree_BP.hpp" +#include "Tree_DFUDS.hpp" + +#include "Tree_Cardinal_Plain.hpp" +#include "Tree_Cardinal_LOUDS.hpp" +#include "Tree_Cardinal_Ordinal.hpp" + +#include "Tree_Labeled.hpp" + +using namespace compactds ; + +void PrintLog( const char *fmt, ... 
) +{ + va_list args ; + va_start( args, fmt ) ; + char buffer[500] ; + vsprintf( buffer, fmt, args ) ; + + time_t mytime = time(NULL) ; + struct tm *localT = localtime( &mytime ) ; + char stime[500] ; + strftime( stime, sizeof( stime ), "%c", localT ) ; + fprintf( stderr, "[%s] %s\n", stime, buffer ) ; +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) + { + fprintf(stderr, "Usage: ./test test_case\n") ; + exit(1) ; + } + + size_t i ; + unsigned int mismatchCnt = 0 ; + //int array[] = {20, 18, 22, 22, 16, 21, 11, 22, 21, 21, 5, 7, 31, 0, 3} ; + //int array[] = {0xfffffff} ; + if (!strcmp(argv[1], "array")) + { + /*int array[] = {0, 0xfff, 0, 1, 2, 3, 4, 5, 6, 8, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} ;*/ + //int array[] = {0, 1, 2} ; + const int n = 1000 ; + unsigned int array[n] ; + for (i = 0 ; i < n ; ++i) + array[i] = (i * 7 + 3)%3 ; // trits + unsigned int len = sizeof(array) / sizeof(array[0]) ; + printf("Raw size: %d\n\n", 
(int)sizeof(array)) ; + + { + FixedSizeElemArray fsea ; + //B.Malloc(5, len) ; + //for (i = 0 ; i < len ; ++i) + // B.Write(i, array[i]) ; + fsea.InitFromArray(-1, array, len) ; + mismatchCnt = 0 ; + printf("Fixed-size element array:\n") ; + for (i = 0 ; i < len ; ++i) + { + if (fsea.Read(i) != array[i]) + { + ++mismatchCnt ; + } + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)fsea.GetSpace()); + } + + { + FixedSizeElemArray fsea ; + //B.Malloc(5, len) ; + //for (i = 0 ; i < len ; ++i) + // B.Write(i, array[i]) ; + fsea.InitFromArray(-1, array, len) ; + + FILE *fp = fopen("tmp.out", "w") ; + fsea.Save(fp) ; + fclose(fp) ; + + fp = fopen("tmp.out", "r") ; + fsea.Load(fp) ; + fclose(fp) ; + + mismatchCnt = 0 ; + printf("\nFixed-size element array load/save:\n") ; + for (i = 0 ; i < len ; ++i) + { + if (fsea.Read(i) != array[i]) + { + ++mismatchCnt ; + } + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)fsea.GetSpace()); + } + + { + FixedSizeElemArray fsea ; + //B.Malloc(5, len) ; + //for (i = 0 ; i < len ; ++i) + // B.Write(i, array[i]) ; + fsea.Malloc(2, 0); + fsea.Reserve(5); + for (i = 0 ; i < len ; ++i) + fsea.PushBack(array[i]); + mismatchCnt = 0 ; + printf("\nFixed-size element array with push back:\n") ; + for (i = 0 ; i < len ; ++i) + { + if (fsea.Read(i) != array[i]) + { + ++mismatchCnt ; + } + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)fsea.GetSpace()); + } + + FractionBitElemArray fbea ; + fbea.InitFromArray(0, array, len) ; + printf("\nFraction bits element array:\n") ; + mismatchCnt = 0 ; + printf("Fixed-size element array:\n") ; + for (i = 0 ; i < len ; ++i) + { + if (fbea.Read(i) != array[i]) + ++mismatchCnt ; + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)fbea.GetSpace()); + + int blockSize = -1 ; + VariableSizeElemArray_SampledPointers vseasp ; + 
vseasp.InitFromArray(blockSize, array, len) ; + printf("\nSampled pointers:\n") ; + mismatchCnt = 0 ; + for (i = 0 ; i < len ; ++i) + { + if (vseasp.Read(i) != array[i]) + ++mismatchCnt ; + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)vseasp.GetSpace()); + + VariableSizeElemArray_DensePointers vseadp ; + vseadp.InitFromArray(blockSize, array, len) ; + printf("\nDense pointers:\n") ; + mismatchCnt = 0 ; + for (i = 0 ; i < len ; ++i) + { + if (vseadp.Read(i) != array[i]) + ++mismatchCnt ; + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)vseadp.GetSpace()); + + { + printf("\nInterleaved array:\n") ; + ILArray il ; + int block = 3 ; + il.Malloc(2, DIV_CEIL(n, block), 2, block - 1) ; + for (i = 0 ; i < len ; ++i) + { + if (i%block == 0) + il.Write(0, i / block, array[i]) ; + else + il.Write(1, i - i / block, array[i]) ; + } + mismatchCnt = 0 ; + for (i = 0 ; i < len ; ++i) + { + if (i%block == 0) + { + if (il.Read(0, i / block) != array[i]) + ++mismatchCnt ; + } + else + { + if (il.Read(1, i - i / block) != array[i]) + ++mismatchCnt ; + } + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)il.GetSpace()); + } + + { + printf("\nInterleaved64 array:\n") ; + IL64Array il64 ; + int block = 3 ; + il64.Malloc(DIV_CEIL(n, block), 2, block - 1) ; + for (i = 0 ; i < len ; ++i) + { + if (i%block == 0) + il64.Write0(i / block, array[i]) ; + else + il64.Write1(i - i / block, array[i]) ; + } + mismatchCnt = 0 ; + for (i = 0 ; i < len ; ++i) + { + if (i%block == 0) + { + if (il64.Read0(i / block) != array[i]) + ++mismatchCnt ; + } + else + { + if (il64.Read1(i - i / block) != array[i]) + ++mismatchCnt ; + } + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)il64.GetSpace()); + } + } +#if 0 // comment out large chunk of the code for compile efficiency. 
Remove this in future + else if (!strcmp(argv[1], "bitvector")) + { + int k = 0 ; + unsigned int sum ; + WORD *B ; + size_t n = 1000000 ; + + B = Utils::MallocByBits(n) ; + + //for (i = 0 ; i*1 < n ; ++i ) + // Utils::BitSet(B, i*1) ; + for (i = 0 ; i < n ; ) + { + int rlen = 5 ; + if (argc > 2) + rlen = atoi(argv[2]) ; + for (int j = 0 ; j < rlen ; ++j) + Utils::BitSet(B, i + j) ; + i += 4 * rlen ; + } + /*for (i = 0 ; i < n ; ++i) + { + if (rand() & 1) + Utils::BitSet(B, i) ; + }*/ + + printf("Raw size: %d\n", (int)DIV_CEIL(n, 8)) ; + + //------ + { + PrintLog("Plain bitvector:") ; + Bitvector_Plain bvp ; + bvp.SetSelectSpeed(1) ; + bvp.Init(B, n) ; + for (i = 0 ; i < n ; ++i) + { + if (bvp.Access(i) != Utils::BitRead(B, i)) + ++mismatchCnt ; + //printf("%d %d\n", bvc.Access(i), array.Read(i)) ; + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + ++sum ; + if (bvp.Rank(1, i) != sum) + ++mismatchCnt ; + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + for (int type = 1 ; type >= 1 ; --type) + { + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == type) + { + size_t s = bvp.Select(type, k + 1) ; + if (s != i) + { + ++mismatchCnt ; + //printf("mismatch %d: %d %d\n", k + 1, s, i) ; + } + ++k ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + for (k = i ; k >= 0 ; --k) + if (bvp.Access(k) == 1) + break ; + if (k < 0) + break ; + if ((int)bvp.Pred(i) != k) + ++mismatchCnt ; + + } + printf("Pred mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + for (k = i ; k < (int)n ; ++k) + if (bvp.Access(k) == 1) + break ; + if (k >= (int)n) + break ; + if ((int)bvp.Succ(i) != k) + ++mismatchCnt ; + } + printf("Succ mismatch count: %d\n", mismatchCnt) ; + + printf("Space usage (byptes): 
%d\n\n", (int)bvp.GetSpace()) ; + } + + + // ------ + { + PrintLog("Compressed bitvector:") ; + Bitvector_Compressed bvc ; + bvc.Init(B, n) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (bvc.Access(i) != Utils::BitRead(B, i)) + ++mismatchCnt ; + //printf("%d %d\n", bvc.Access(i), array.Read(i)) ; + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + ++sum ; + if (bvc.Rank(1, i/*, inclusive=1*/) != sum) + ++mismatchCnt ; + //printf("%d %d\n", bvc.Rank(i), sum) ; + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + { + if (bvc.Select(k + 1) != i) + ++mismatchCnt ; + ++k ; + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + printf("Space usage (byptes): %d\n\n", (int)bvc.GetSpace()) ; + } + //----- + { + PrintLog("Sparse bitvector:") ; + Bitvector_Sparse bvs ; + bvs.Init(B, n) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (bvs.Access(i) != Utils::BitRead(B, i)) + { + ++mismatchCnt ; + //printf("%d: %d %d\n", i, bvs.Access(i), Utils::BitRead(B, i)) ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + ++sum ; + if (bvs.Rank(1, i) != sum) + { + ++mismatchCnt ; + //printf("compare %d: %d %d\n", i, bvs.Rank(1, i), sum) ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + { + if (bvs.Select(k + 1) != i) + ++mismatchCnt ; + ++k ; + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (byptes): %d\n\n", (int)bvs.GetSpace()) ; + } + + if (1) + { + PrintLog("Sparse bitvector load/save:") ; + Bitvector_Sparse bvs ; + bvs.Init(B, n) ; + + FILE *fp = fopen("tmp.out", "w") ; + 
bvs.Save(fp) ; + fclose(fp) ; + + fp = fopen("tmp.out", "r") ; + bvs.Load(fp) ; + fclose(fp) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (bvs.Access(i) != Utils::BitRead(B, i)) + { + ++mismatchCnt ; + //printf("%d: %d %d\n", i, bvs.Access(i), Utils::BitRead(B, i)) ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + ++sum ; + if (bvs.Rank(1, i) != sum) + { + ++mismatchCnt ; + //printf("compare %d: %d %d\n", i, bvs.Rank(1, i), sum) ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + { + if (bvs.Select(k + 1) != i) + ++mismatchCnt ; + ++k ; + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (byptes): %d\n\n", (int)bvs.GetSpace()) ; + } + + //----- + { + PrintLog("Run-length bitvector:") ; + Bitvector_RunLength bvr ; + bvr.Init(B, n) ; + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (bvr.Access(i) != Utils::BitRead(B, i)) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + ++sum ; + if (bvr.Rank(1, i) != sum) + { + ++mismatchCnt ; + //printf("compare %d: %d %d\n", i, bvr.Rank(1, i), sum) ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + { + if (bvr.Select(k + 1) != i) + { + //printf("compare %d: %d %d\n", k, bvr.Select(k + 1), i) ; + ++mismatchCnt ; + } + ++k ; + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (byptes): %d\n\n", (int)bvr.GetSpace()) ; + } + free(B) ; + } + else if (!strcmp(argv[1], "sequence")) + { + char abList[] = "ACGT" ; + Alphabet abCode ; + abCode.InitFromList(abList, strlen(abList)) ; + + 
size_t n = 1000000 ; + FixedSizeElemArray S ; + S.Malloc(2, n) ; + + if (1) + { + //FILE *fp = fopen("testdata/bwt_1M.out", "r") ; + FILE *fp = fopen("testdata/bwt_7M-8M.out", "r") ; + //FILE *fp = fopen("testdata/bwt_2M.out", "r") ; + //FILE *fp = fopen("testdata/tmp.out", "r") ; + int t ; + for (i = 0 ; i < n ; ++i) + { + fscanf(fp, "%d", &t) ; + S.Write(i, t) ; + } + fclose(fp) ; + } + else + { + /*srand(1) ; + for (i = 0 ; i < n ; ++i) + { + S.Write(i, rand()%4) ; + }*/ + size_t rlen = 5 ; + if (argc > 2) + rlen = atoi(argv[2]) ; + uint8_t prevc = -1 ; + for (i = 0 ; i < n ; i += rlen) + { + uint8_t c = rand() % 4; + while (c == prevc) + c = rand() % 4 ; + for (size_t j = 0 ; j < rlen ; ++j) + S.Write(i + j, c) ; + prevc = c ; + } + } + + printf("Raw size: %d\n", (int)S.GetSpace()) ; + + if (0) + { + printf("\nPlain+Bitvector_Plain\n") ; + Sequence_Plain t ; + t.SetAlphabet(abCode) ; + t.Init(S, n, abList ) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + if (t.Rank(abList[j], i) != sum) + ++mismatchCnt ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + if (t.Select(abList[j], cnt) != i) + { + //printf("%d: %d %d %d\n", (int)j, cnt, t.Select(abList[j], cnt), i) ; + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + + if (1) + { + printf("\nWavelet tree + plain bitvector:\n") ; + Sequence_WaveletTree<> t ; + t.SetAlphabet(abCode) ; + t.Init(S, n, abList ) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if 
(t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + if (t.Rank(abList[j], i) != sum) + ++mismatchCnt ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + + if (0) + { + printf("\nsave/load:\n") ; + Sequence_Hybrid t ; + t.SetAlphabet(abCode) ; + t.Init(S, n, abList) ; + + FILE *fp = fopen("tmp.out", "w") ; + t.Save(fp) ; + fclose(fp) ; + + fp = fopen("tmp.out", "r") ; + t.Load(fp) ; + fclose(fp) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + if (t.Rank(abList[j], i) != sum) + ++mismatchCnt ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + if (1) + { + printf("\nWavelet tree + run-length bitvector:\n") ; + Sequence_WaveletTree t ; + 
t.SetAlphabet(abCode) ; + t.Init(S, n, abList ) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + if (t.Rank(abList[j], i) != sum) + ++mismatchCnt ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + + { + printf("\nRun length:\n") ; + Sequence_RunLength t ; + //t.SetAlphabet(abCode) ; + t.Init(S, n, abList ) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + //printf("%d: %d %d %d\n", j, sum, t.Rank(abList[j], i)) ; + if (t.Rank(abList[j], i) != sum) + { + //printf("ERROR\n") ; + ++mismatchCnt ; + } + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + + { + 
printf("\nHybrid:\n") ; + Sequence_Hybrid t ; + //t.SetAlphabet(abCode) ; + t.SetBlockSize(8) ; + t.Init(S, n, abList ) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + //printf("%d: %d %d %d\n", i, j, sum, t.Rank(abList[j], i)) ; + if (t.Rank(abList[j], i) != sum) + { + //printf("ERROR\n") ; + ++mismatchCnt ; + } + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + { + printf("\nRunBlock:\n") ; + Sequence_RunBlock t ; + //t.SetAlphabet(abCode) ; + t.Init(S, n, abList ) ; + + /*FILE *fp = fopen("tmp.out", "w") ; + t.Save(fp) ; + fclose(fp) ; + + fp = fopen("tmp.out", "r") ; + t.Load(fp) ; + fclose(fp) ;*/ + + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + //printf("%d: %d %d %d\n", i, j, sum, t.Rank(abList[j], i)) ; + if (t.Rank(abList[j], i) != sum) + { + //printf("ERROR\n") ; + ++mismatchCnt ; + } + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + 
{ + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + } + else if (!strcmp(argv[1], "hash")) + { + const int n = 20 ; + uint64_t array[n] ; + + for (i = 0 ; i < n ; ++i) + array[i] = i ; + UniversalHashGenerator uh ; + uint64_t a, b ; + int j ; + uh.Init(2 * n, /*seed=*/0) ; + printf("Universal hash:\n") ; + for (j = 0 ; j < 3 ; ++j) + { + uh.Generate(a, b) ; + printf("Hash%d %llu %llu\n", j, (long long unsigned)a, (long long unsigned)b) ; + for (i = 0 ; i < n ; ++i) + printf("%d ", (int)uh.Map(a, b, array[i])) ; + printf("\n") ; + } + printf("\n") ; + + PerfectHash perfhash ; + perfhash.Init(array, n, /*m=*/0) ; + printf("Perfect hash:\n") ; + for (i = 0 ; i < n ; ++i) + { + printf("%d ", (int)perfhash.Map(array[i])) ; + } + printf("\n") ; + printf("Space usage (bytes): %d\n", (int)perfhash.GetSpace()) ; + } + else if (!strcmp(argv[1], "huffman")) + { + const int n = 4 ; + uint64_t freq[n] = {5, 10, 100, 1}; + HuffmanCode huffmanCode ; + huffmanCode.InitFromFrequency(freq, n) ; + + printf("Huffman code:\n") ; + for (i = 0 ; i < n ; ++i) + { + int l = 0 ; + WORD code = huffmanCode.Encode(i, l) ; + printf("%d %d: %llu %d => %d\n", (int)i, (int)freq[i], (long long unsigned)code, l, + huffmanCode.Decode(code, l)) ; + } + printf("Space usage (bytes): %d\n", (int)huffmanCode.GetSpace()) ; + } + else if (!strcmp(argv[1], "partialsum")) + { + const int n = 100 ; + int array[n] ;//= {0, 0, 0}; + for (i = 0 ; i < n ; ++i) + array[i] = i ; + array[10] = 0 ; + + PartialSum psum ; + psum.Init(array, n) ; + + printf("Succinct partial sum:\n") ; + int s = 0 ; + mismatchCnt = 0 ; + for (i = 0 ; i <= n ; ++i) + { + if (s != (int)psum.Sum(i)) + { + ++mismatchCnt ; + } + s += array[i] ; + } + printf("Sum query mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt 
= 0 ; + int j ; + s = 0 ; + for (i = 0 ; i < n ; ++i) + { + for (j = 0 ; j < array[i] ; ++j) + { + if (psum.Search(s + j) != i) + ++mismatchCnt ; + } + s += array[i] ; + } + printf("Search mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (psum.AccessValue(i) != array[i]) + ++mismatchCnt ; + } + printf("AccessValue mismatch count: %d\n", mismatchCnt) ; + + printf("Space usage (bytes): %d\n", (int)psum.GetSpace()) ; + } + else if (!strcmp(argv[1], "sa")) + { + FixedSizeElemArray s ; + size_t n = 10000 ; + s.Malloc(2, n) ; + srand(1) ; + for (i = 0 ; i < (size_t)n ; ++i) + { + s.Write(i, rand() % 4) ; + //s.Write(i, i % 4) ; + //printf("%d ", s.Read(i)) ; + } + //printf("\n") ; + /*std::vector truth ; + for (j = 0 ; j <= 3 ; ++j) + { + int i ; + for (i = n - 1 - (n - 1) % 4 + j ; i >= 0 ; i -= 4) + { + if (i >= n) + continue ; + truth.push_back(i) ; + } + }*/ + + // Check the cuts + SuffixArrayGenerator saGenerator ; + size_t cutCnt = saGenerator.Init(s, n, n / 4, /*diffcov_v=*/4096, 4) ; + /*for (i = 0 ; i < cutCnt ; ++i) + { + std::vector< std::vector > pos = saGenerator.GetChunksPositions(s, n, i, i) ; + //printf("%d\n", pos[0].size()) ; + int size = pos[0].size() ; + int j ; + for (j = 0 ; j < size ; ++j) + printf("%d ", pos[0][j]) ; + printf("\n") ; + }*/ + size_t *sa = (size_t *)malloc(sizeof(size_t) * n); + size_t calculated = 0 ; + for (i = 0 ; i < cutCnt ; ++i) + { + std::vector< std::vector > pos ; + saGenerator.GetChunksPositions(s, n, i, i, 0, n - 1, pos) ; + int size = pos[0].size() ; + printf("%lu %d. 
%lu\n", i, size, calculated) ; + saGenerator.SortSuffixByPos(s, n, pos[0].data(), size, sa + calculated) ; + calculated += size ; + } + printf("Validate result: %d\n", saGenerator.ValidateSA(s, n, sa)) ; + free(sa) ; + + /*mismatchCnt = 0 ; + for (i = 0 ; i < (size_t)n ; ++i) + { + if (truth[i] != sa[i]) + ++mismatchCnt ; + //printf("%d %d\n", (int)truth[i], (int)sa[i]) ; + } + printf("SA mismatch: %d\n", mismatchCnt) ;*/ + + } + else if (!strcmp(argv[1], "fm")) + { + FixedSizeElemArray s ; + const size_t n = 10000 ; + const size_t testLen = 50 ; + s.Malloc(2, n) ; + char strs[n + 1] ; + srand(1) ; + char abList[] = "ACGT" ; + for (i = 0 ; i < (size_t)n ; ++i) + { + int r = rand() % 4 ; + s.Write(i, r) ; + strs[i] = abList[r] ; + } + //s.Print(stdout) ; + //printf("%s\n", strs) ; + struct _FMBuilderParam param ; + struct _FMIndexAuxData fmAuxData ; + param.threadCnt = 4 ; + param.saBlockSize = n / 4 ; + FixedSizeElemArray BWT ; + param.precomputeWidth = testLen > 10 ? 10 : testLen ; + param.maxLcp = 17 ; + + size_t firstISA = 0 ; + param.selectedISA[0] = 0 ; + param.selectedISA[1] = 0 ; + FMBuilder::Build(s, n, 4, BWT, firstISA, param) ; + + Sequence_RunBlock t ; + t.Init(BWT, n, abList) ; + + //BWT.Print(stdout) ; + // + //printf("%d %d\n", precomputedRange[0].first, precomputedRange[0].second) ; + /*for (i = 0 ; i < 100 ; ++i) + { + if (precomputedRange[i].second > 0) + printf("%d %d\n", precomputedRange[i].first, precomputedRange[i].second) ; + }*/ + size_t count = 0 ; + for (i = 0 ; i < n ; i += WORDBITS) + count += Utils::Popcount(param.semiLcpGreater[i / WORDBITS]) ; + printf("Number of 1s in semiLcpGreater: %lu\n", count) ; + + count = 0 ; + for (i = 0 ; i < n ; i += WORDBITS) + count += Utils::Popcount(param.semiLcpEqual[i / WORDBITS]) ; + printf("Number of 1s in semiLcpEqual: %lu\n", count) ; + + FMIndex< Sequence_WaveletTree > fmIndex ; + //FMIndex< Sequence_Plain > fmIndex ; + //FMIndex< Sequence_RunBlock > fmIndex ; + fmIndex.Init(BWT, n, firstISA, + 
param, + abList, strlen(abList)) ; + printf("firstISA = %lu; lastISA = %lu\n", firstISA, fmIndex.GetLastISA()) ; + + size_t sp, ep, l ; + char test[testLen + 1] ; + test[testLen] = '\0' ; + size_t k ; + size_t mismatchCnt = 0 ; + size_t compareCnt = 0 ; + for (k = 0 ; k + testLen <= n; ++k) + { + memcpy(test, strs + k, testLen) ; + //strcpy(test, "GATGGAGATG") ; + //printf("test: %s\n", test) ; + l = fmIndex.BackwardSearch(test, strlen(test), sp, ep) ; + //printf("Backward search %d %d %d\n", l, sp, ep) ; + if (sp < ep) + continue ; + ++compareCnt ; + for (i = sp ; i <= ep ; ++i) + { + size_t sa = fmIndex.BackwardToSampledSA(i, l) ; + if (sa + l != k) + { + ++mismatchCnt ; + printf("SA[%lu] = %lu+%lu. %lu\n", i, sa, l, k) ; + } + } + } + printf("Mismatch count: %lu out of %lu\n", mismatchCnt, compareCnt) ; + + /*FILE *fp = fopen("tmp.out", "w") ; + fmIndex.Save(fp) ; + fclose(fp) ; + + fp = fopen("tmp.out", "r") ; + fmIndex.Load(fp) ; + fclose(fp) ; + + printf("Save/Load:\n") ; + l = fmIndex.BackwardSearch(test, strlen(test), sp, ep) ; + printf("Backward search %d %d %d\n", l, sp, ep) ; + for (i = sp ; i <= ep ; ++i) + { + size_t sa = fmIndex.BackwardToSampledSA(i, l) ; + printf("SA[%d] = %d+%d\n", i, sa, l) ; + }*/ + + + //free(sampledSa) ; + //free(precomputedRange) ; + //free(semiLcpGreater) ; + //free(semiLcpEqual) ; + } + else if (!strcmp(argv[1], "diffcover")) + { + DifferenceCover dc ; + unsigned int v = 4096 ; + dc.Init(v) ; + size_t j ; + mismatchCnt = 0 ; + for (i = 0 ; i < v ; ++i) + for (j = 0 ; j < v ; ++j) + { + int d = dc.Delta(i, j) ; + if (!dc.IsInDC(i + d) || !dc.IsInDC(j + d)) + { + ++mismatchCnt ; + } + } + printf("%d\n", mismatchCnt) ; + } + else if (!strcmp(argv[1], "permutation")) + { + const int n = 1000; + size_t *perm = new size_t[n]; + size_t *inv = new size_t[n] ; + for (i = 0 ; i < n ; ++i) + perm[i] = (i + 1)%n ; + printf("Raw permutation size %lu\n", n * sizeof(perm[0])) ; + for (i = 0 ; i < n ; ++i) + { + if (0) + { + size_t tmp ; + 
size_t j = i + rand() % (n - i) ; + tmp = perm[j] ; + perm[j] = perm[i] ; + perm[i] = tmp ; + } + //perm[i] = (i*10001+1)%n ; + inv[ perm[i] ] = i ; + } + //for (i = 0 ; i < n ; ++i) + // printf("%d ", perm[i]) ; + //printf("\n") ; + + { + printf("\ninverse permutation\n") ; + DS_InvPermutation invperm ; + invperm.Init(perm, n) ; + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (invperm.Query(perm, i) != inv[i]) + ++mismatchCnt ; + printf("Inverse mismatch count %d\n", mismatchCnt) ; + printf("Space usage: %d\n", (int)invperm.GetSpace()) ; + } + + { + printf("\ncompressed permutation\n") ; + Permutation cperm ; + cperm.Init(perm, n) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (cperm.Next(i) != perm[i]) + ++mismatchCnt ; + } + printf("Next mismatch count %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (cperm.Prev(i) != inv[i]) + ++mismatchCnt ; + } + printf("Prev mismatch count %d\n", mismatchCnt) ; + + printf("Space usage: %d\n", (int)cperm.GetSpace()) ; + } + + delete[] perm ; + delete[] inv ; + } + else if (!strcmp(argv[1], "invindex")) + { + size_t n = 10000 ; + FixedSizeElemArray a ; + a.Malloc(3, n) ; + size_t i ; + srand(17) ; + int stride = 5 ; + for (i = 0 ; i < n ; ++i) + { + a.Write(i, i%stride) ; + } + + InvertedIndex idx ; + idx.Init(a, n, false) ; + size_t mismatchCnt = 0 ; + printf("Raw sequence space usage %lu\n", a.GetSpace()) ; + int label = 1 ; + for (i = label ; i < n ; i += stride) + { + if (idx.Search(label, i / stride) != i) + { + //printf("%lu %lu\n", i, idx.Search(label, i / stride)) ; + ++mismatchCnt ; + } + } + printf("Inverted index mismatch count %lu\n", mismatchCnt) ; + printf("Inverted index based on permutation space usage %lu\n", idx.GetSpace()) ; + } +#endif + else if (!strcmp(argv[1], "rmmtree")) + { + int n = 1000000 ; + int i, j ; + WORD *B = Utils::MallocByBits(n) ; + printf("Raw representation space usage: %lu\n", Utils::BitsToWordBytes(n)) ; + srand(1) ; + for (i = 0 ; i < 
n ; ++i) + { + if (rand() & 1) + //if (i < n / 2) + //if (i % 2 == 0) + Utils::BitSet(B, i) ; + } + + DS_RangeMinMaxTree rmmTree ; + rmmTree.SetBlockSize(32) ; + rmmTree.Init(B, n) ; + + // Test forward and backward search + if (0) + { + int d ; + int stride = 11 ; + for (d = -stride ; d <= stride ; d += 2 * stride) + { + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + int excess = 0 ; + for (j = i ; j < n ; ++j) + { + excess += 2 * Utils::BitRead(B, j) - 1 ; + if (excess == d) + break ; + } + int truth = j ; + j = rmmTree.FwdSearch(i, d, B, n) ; + if (j != truth) + ++mismatchCnt ; + //if (j != truth) + // printf("%d %d %d\n", i, truth, j) ; + } + printf("FwdSearch %d mismatch count %u\n", d, mismatchCnt) ; + } + + for (d = -stride ; d <= stride ; d += 2 * stride) + { + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + int excess = 0 ; + for (j = i ; j >= 0 ; --j) + { + excess -= (2 * Utils::BitRead(B, j) - 1) ; + if (excess == d) + break ; + } + int truth = j ; + if (truth == -1) + truth = n ; + j = rmmTree.BwdSearch(i, d, B, n) ; + if (j != truth) + ++mismatchCnt ; + //if (j != truth) + // printf("%d %d %d\n", i, truth, j) ; + } + printf("BwdSearch %d mismatch count %u\n", d, mismatchCnt) ; + } + } + + // Test rmq and rMq + { + int len = 10000 ; + mismatchCnt = 0 ; + for (i = 0 ; i + len <= n ; ++i) + { + int excess = 0 ; + int min = 2 ; + int mintag = i, maxtag = i; + int max = -2 ; + int minCnt = 0 ; + int lastMinTag = 0 ; + for (j = i ; j < i + len ; ++j) + { + excess += (2 * Utils::BitRead(B, j) - 1) ; + if (excess < min) + { + min = excess ; + mintag = j ; + minCnt = 1 ; + lastMinTag = j ; + } + else if (excess == min) + { + ++minCnt ; + lastMinTag = j ; + } + + if (excess > max) + { + max = excess ; + maxtag = j ; + } + } + + //printf("%d %d\n", rmmTree.ExtremeExcess(B, n, i, i + len - 1, 0), min) ; + if (rmmTree.ExtremeExcess(i, i + len - 1, 0, B, n) != min) + { + ++mismatchCnt ; + //printf("min mismatch %d\n", i) ; + } + if (rmmTree.ExtremeExcess(i, 
i + len - 1, 1, B, n) != max) + { + ++mismatchCnt ; + //printf("max mismatch %d\n", i) ; + } + + if ((int)rmmTree.Rmq(i, i + len - 1, B, n) != mintag) + { + ++mismatchCnt ; + } + + if ((int)rmmTree.RMq(i, i + len - 1, B, n) != maxtag) + { + ++mismatchCnt ; + } + + if ((int)rmmTree.MinCount(i, i + len - 1, B, n) != minCnt) + { + ++mismatchCnt ; + //printf("min count mismatch %d: %d %d\n", i, min, minCnt) ; + } + + if ((int)rmmTree.MinSelect(i, i + len - 1, minCnt, B, n) != lastMinTag) + { + ++mismatchCnt ; + //printf("min select mismatch %d: %d %d %d\n", i, min, minCnt, lastMinTag) ; + } + } + printf("extreme excess mismatch count %u\n", mismatchCnt) ; + } + + printf("rmmTree space usage (bytes): %lu\n", rmmTree.GetSpace(true)) ; + } + else if (!strcmp(argv[1], "tree")) + { + // Test example: tree with child count 2 (from root), 3, 4, 5, .... + // Or a binary tree + int i, j ; + Tree_Plain tree ; + tree.Init() ; + + int internalN = 10000 ; + srand(1) ; + for (i = 0 ; i < internalN ; ++i) + { + int childCnt = rand() % 4 + 1 ; + for (j = 0 ; j < childCnt ; ++j) + { + size_t tid = tree.AddNode(i) ; + tree.SetLabel(tid, childCnt) ; + } + } + + size_t *map = new size_t[tree.GetSize()] ; + + if (0) + { + // This test is for binary tree. 
+ mismatchCnt = 0 ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.ChildrenCount(i) != 2 || (int)tree.FirstChild(i) != (2 * i + 1) + || (int)tree.LastChild(i) != (2 * i + 2)) + ++mismatchCnt ; + } + printf("plain tree mismatch count %u\n", mismatchCnt) ; + //printf("plain tree space usage (bytes): %lu\n", tree.GetSpace(true)) ; + } + printf("plain tree space usage (bytes): %lu\n", tree.GetSpace(true)) ; + + { + Tree_LOUDS t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), tree.GetSize(), map) ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.IsLeaf(i)) + continue ; + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.ChildRank(v) != tree.ChildRank(i) + || (!tree.IsLastChild(i) && t.NodeMap(t.NextSibling(v)) != map[tree.NextSibling(i)]) + || (!tree.IsFirstChild(i) && t.NodeMap(t.PrevSibling(v)) != map[tree.PrevSibling(i)]) + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))) != map[tree.LCA(i, internalN)] + ) + ++mismatchCnt ; + } + printf("\nLOUDS tree mismatch count %u\n", mismatchCnt) ; + printf("LOUDS tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + + { + Tree_BP t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), tree.GetSize(), map) ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.IsLeaf(i)) + continue ; + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.ChildRank(v) != tree.ChildRank(i) + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || t.NodeMap(t.ChildSelect(v, 1)) != map[tree.ChildSelect(i, 1)] + || (!tree.IsLastChild(i) && t.NodeMap(t.NextSibling(v)) != map[tree.NextSibling(i)]) + || (!tree.IsFirstChild(i) && t.NodeMap(t.PrevSibling(v)) != 
map[tree.PrevSibling(i)]) + || (t.IsAncestor(v, t.NodeSelect(map[internalN])) != tree.IsAncestor(i, internalN)) + || t.Depth(v) != tree.Depth(i) + || t.SubTreeSize(v) != tree.SubTreeSize(i) + || t.LeafCountInSubTree(v) != tree.LeafCountInSubTree(i) + || t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))) != map[tree.LCA(i, internalN)] + ) + { + ++mismatchCnt ; + //printf("%d %d. %d. %d %d. %d\n", t.LeafCountInSubTree(v), tree.LeafCountInSubTree(i), tree.GetSize(), v, i, tree.ChildrenCount(i)) ; + } + } + + for (i = internalN ; i < (int)tree.GetSize(); ++i) + { + size_t v = t.NodeSelect(map[i]) ; + if (t.LeafCountInSubTree(v) != tree.LeafCountInSubTree(i)) + //|| (int)t.LeafRank(v) != i - internalN + 1) + ++mismatchCnt ; + } + + printf("\nBP tree mismatch count %u\n", mismatchCnt) ; + printf("BP tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + + { + Tree_DFUDS t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), tree.GetSize(), map) ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.IsLeaf(i)) + continue ; + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.ChildRank(v) != tree.ChildRank(i) + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || t.NodeMap(t.ChildSelect(v, 1)) != map[tree.ChildSelect(i, 1)] + || (!tree.IsLastChild(i) && t.NodeMap(t.NextSibling(v)) != map[tree.NextSibling(i)]) + || (!tree.IsFirstChild(i) && t.NodeMap(t.PrevSibling(v)) != map[tree.PrevSibling(i)]) + || (t.ChildrenCount(v) > 1 && t.NodeMap(t.ChildSelect(v, 2)) != map[tree.ChildSelect(i, 2)]) + || (t.IsAncestor(v, t.NodeSelect(map[internalN])) != tree.IsAncestor(i, internalN)) + //|| t.Depth(v) != tree.Depth(i) + || t.SubTreeSize(v) != tree.SubTreeSize(i) + || t.LeafCountInSubTree(v) != tree.LeafCountInSubTree(i) + || t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))) != map[tree.LCA(i, internalN)] + ) + ++mismatchCnt 
; + //printf("%d %d %d\n", v, + // t.NodeSelect(map[internalN]), + // t.LCA(v, t.NodeSelect(map[internalN]))) ; + //printf("%d %d. %d. %d %d. %d\n", t.LeafCountInSubTree(v), tree.LeafCountInSubTree(i), tree.GetSize(), v, i, tree.ChildrenCount(i)) ; + } + + for (i = internalN ; i < (int)tree.GetSize(); ++i) + { + size_t v = t.NodeSelect(map[i]) ; + if (t.LeafCountInSubTree(v) != tree.LeafCountInSubTree(i)) + ++mismatchCnt ; + } + printf("\nDFUDS tree mismatch count %u\n", mismatchCnt) ; + printf("DFUDS tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + + { + Tree_Labeled<> t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), tree.GetSize(), map) ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.IsLeaf(i)) + continue ; + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.ChildRank(v) != tree.ChildRank(i) + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || t.NodeMap(t.ChildSelect(v, 1)) != map[tree.ChildSelect(i, 1)] + || (!tree.IsLastChild(i) && t.NodeMap(t.NextSibling(v)) != map[tree.NextSibling(i)]) + || (!tree.IsFirstChild(i) && t.NodeMap(t.PrevSibling(v)) != map[tree.PrevSibling(i)]) + || (t.ChildrenCount(v) > 1 && t.NodeMap(t.ChildSelect(v, 2)) != map[tree.ChildSelect(i, 2)]) + || (t.IsAncestor(v, t.NodeSelect(map[internalN])) != tree.IsAncestor(i, internalN)) + //|| t.Depth(v) != tree.Depth(i) + || t.SubTreeSize(v) != tree.SubTreeSize(i) + || t.LeafCountInSubTree(v) != tree.LeafCountInSubTree(i) + || t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))) != map[tree.LCA(i, internalN)] + ) + ++mismatchCnt ; + //printf("%d %d %d\n", v, + // t.NodeSelect(map[internalN]), + // t.LCA(v, t.NodeSelect(map[internalN]))) ; + //printf("%d %d. %d. %d %d. 
%d\n", t.LeafCountInSubTree(v), tree.LeafCountInSubTree(i), tree.GetSize(), v, i, tree.ChildrenCount(i)) ; + } + + // Labels + for (i = 0 ; i < internalN ; ++i) + { + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenLabeled(v, 3) != tree.ChildrenLabeled(i, 3) + || (tree.ChildrenLabeled(i, 3 ) > 0 && t.NodeMap(t.LabeledChildSelect(v, 3, 2)) != map[tree.LabeledChildSelect(i, 3, 2)]) + || t.ChildLabel(v) != tree.ChildLabel(i) + ) + { + // printf("%d %d %d: %d %d %d\n", i, map[i], v, t.ChildrenLabeled(v, 1), tree.ChildrenLabeled(i, 1), + // tree.ChildrenCount(i)) ; + ++mismatchCnt ; + } + } + + printf("\nLabeled tree mismatch count %u\n", mismatchCnt) ; + printf("Labeled tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + + delete[] map ; + } + else if (!strcmp(argv[1], "patternrs")) + { + int k = 0 ; + unsigned int sum ; + WORD *B ; + size_t n = 1000000 ; + n = 65536*32 ; + + B = Utils::MallocByBits(n) ; + + //for (i = 0 ; i*1 < n ; ++i ) + // Utils::BitSet(B, i*1) ; + /*for (i = 0 ; i < n ; ) + { + int rlen = 5 ; + if (argc > 2) + rlen = atoi(argv[2]) ; + for (int j = 0 ; j < rlen ; ++j) + Utils::BitSet(B, i + j) ; + i += 4 * rlen ; + }*/ + /*for (i = 0 ; i < n ; ++i) + { + if (rand() & 1) + Utils::BitSet(B, i) ; + }*/ + DS_Parenthesis tmp ; + tmp.GenerateRandomBalanceParenthesis(B, n) ; + + printf("Raw size: %d\n", (int)DIV_CEIL(n, 8)) ; + + WORD pat = 2 ; // binary 10 + int patLen = 2 ; + DS_PatternRankSelect patrs ; + mismatchCnt = 0 ; + sum = 0 ; + patrs.Init(B, n, pat, patLen) ; + for (i = 0 ; i < n ; ++i) + { + if (patrs.IsPattern(i, B, n)) + ++sum ; + if (patrs.Rank(i, B, n) != sum) + ++mismatchCnt ; + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (patrs.IsPattern(i, B, n)) + { + size_t s = patrs.Select(k + 1, B, n) ; + if (s != i) + { + ++mismatchCnt ; + //printf("mismatch %d: %d %d\n", k + 1, s, i) ; + } + ++k ; + } + } + printf("Select mismatch count: %d (%d)\n", mismatchCnt, 
k) ; + printf("DS_PatternRankSelect space: %lu\n", patrs.GetSpace()) ; + } + else if (!strcmp(argv[1], "cardtree")) // cardinal tree + { + // Test example: tree with child count 2 (from root), 3, 4, 5, .... + // Or a binary tree + int i, j ; + int c = 4 ; // cardinality + + Tree_Cardinal_Plain tree ; + tree.Init(c) ; + + int internalN = 10000 ; + srand(1) ; + for (i = 0 ; i < internalN ; ++i) + { + //int childCnt = rand() % c + 2 ; + //int childCnt = c ; + int step = rand() % c + 1 ; + //step = 1 ; + for (j = 0 ; j < c ; j += step) + tree.AddNode(i, j) ; + } + + size_t *map = new size_t[tree.GetSize()] ; + if (0) + { + // This test is for binary tree. + mismatchCnt = 0 ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.ChildrenCount(i) != 2 || (int)tree.FirstChild(i) != (2 * i + 1) + || (int)tree.LastChild(i) != (2 * i + 2)) + ++mismatchCnt ; + } + printf("plain cardinal tree mismatch count %u\n", mismatchCnt) ; + } + printf("plain cardinal tree space usage (bytes): %lu\n", tree.GetSpace(true)) ; + + if (1) + { + Tree_Cardinal_LOUDS<> t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), tree.GetSize(), c, map) ; + for (i = 0 ; i < internalN ; ++i) + { + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || (t.ChildrenCount(v) > 1 && t.NodeMap(t.ChildSelect(v, 2)) != map[tree.ChildSelect(i, 2)]) + || t.ChildRank(v) != tree.ChildRank(i) + || t.NodeMap(t.LCA(v, t.NodeSelect(internalN))) != map[tree.LCA(i, internalN)] + ) + ++mismatchCnt ; + //printf("%d: %d %d\n", v, t.NodeMap(t.Parent(v)), tree.Parent(i)) ; + } + printf("\nLOUDS cardinal tree mismatch count %u\n", mismatchCnt) ; + printf("LOUDS cardinal tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + + { + Tree_Cardinal_Ordinal<> t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), 
tree.GetSize(), c, map) ; + for (i = 0 ; i < internalN ; ++i) + { + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || (t.ChildrenCount(v) > 1 && t.NodeMap(t.ChildSelect(v, 2)) != map[tree.ChildSelect(i, 2)]) + || t.ChildRank(v) != tree.ChildRank(i) + || t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))) != map[tree.LCA(i, internalN)] + ) + ++mismatchCnt ; + //printf("%d %d %d: %d %d\n", v, i, map[i], t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))), map[tree.LCA(i, internalN)]) ; + } + printf("\nDFUDS cardinal tree mismatch count %u\n", mismatchCnt) ; + printf("DFUDS cardinal tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + delete[] map ; + } + + PrintLog("Done") ; + return 0 ; +}