From 266f3414053ae7e42b3e664367cc5befefd8cc97 Mon Sep 17 00:00:00 2001 From: mourisl Date: Sat, 18 Nov 2023 19:31:14 -0500 Subject: [PATCH] Add back compactds as source code instead of submodule --- compactds/Alphabet.hpp | 196 ++ compactds/Bitvector.hpp | 70 + compactds/Bitvector_Compressed.hpp | 370 ++++ compactds/Bitvector_Plain.hpp | 206 ++ compactds/Bitvector_RunLength.hpp | 228 ++ compactds/Bitvector_Sparse.hpp | 336 +++ compactds/CompactMapper.hpp | 130 ++ compactds/CompressedSuffixArray.hpp | 154 ++ compactds/DS_InvPermutation.hpp | 137 ++ compactds/DS_Parenthesis.hpp | 150 ++ compactds/DS_PatternRankSelect.hpp | 272 +++ compactds/DS_RangeMinMaxTree.hpp | 920 ++++++++ compactds/DS_Rank.hpp | 298 +++ compactds/DS_Select.hpp | 753 +++++++ compactds/DS_Select_Test.hpp | 537 +++++ compactds/DifferenceCover.hpp | 201 ++ compactds/EliasCode.hpp | 74 + compactds/FMBuilder.hpp | 504 +++++ compactds/FMIndex.hpp | 491 +++++ compactds/FixedSizeElemArray.hpp | 322 +++ compactds/FractionBitElemArray.hpp | 118 ++ compactds/HuffmanCode.hpp | 230 ++ compactds/InterleavedFixedSizeElemArray.hpp | 238 +++ compactds/InvertedIndex.hpp | 131 ++ compactds/Makefile | 31 + compactds/PartialSum.hpp | 140 ++ compactds/PerfectHash.hpp | 199 ++ compactds/Permutation.hpp | 237 +++ compactds/Sequence.hpp | 48 + compactds/SequenceCompactor.hpp | 76 + compactds/Sequence_Hybrid.hpp | 328 +++ compactds/Sequence_Permutation.hpp | 70 + compactds/Sequence_Plain.hpp | 101 + compactds/Sequence_RunBlock.hpp | 363 ++++ compactds/Sequence_RunLength.hpp | 191 ++ compactds/Sequence_WaveletTree.hpp | 338 +++ compactds/SimpleVector.hpp | 388 ++++ compactds/SuffixArrayGenerator.hpp | 725 +++++++ compactds/Tree.hpp | 167 ++ compactds/Tree_BP.hpp | 316 +++ compactds/Tree_Cardinal.hpp | 43 + compactds/Tree_Cardinal_LOUDS.hpp | 203 ++ compactds/Tree_Cardinal_Ordinal.hpp | 167 ++ compactds/Tree_Cardinal_Plain.hpp | 255 +++ compactds/Tree_DFUDS.hpp | 283 +++ compactds/Tree_LOUDS.hpp | 171 ++ 
compactds/Tree_Labeled.hpp | 256 +++ compactds/Tree_Plain.hpp | 277 +++ compactds/UniversalHashGenerator.hpp | 81 + compactds/Utils.hpp | 292 +++ compactds/VariableSizeElemArray.hpp | 33 + .../VariableSizeElemArray_DensePointers.hpp | 144 ++ .../VariableSizeElemArray_DirectAccess.hpp | 76 + .../VariableSizeElemArray_SampledPointers.hpp | 118 ++ compactds/bitvector_benchmark.cpp | 162 ++ compactds/notes.md | 7 + compactds/rbbwt.cpp | 148 ++ compactds/test.cpp | 1881 +++++++++++++++++ 58 files changed, 15381 insertions(+) create mode 100644 compactds/Alphabet.hpp create mode 100644 compactds/Bitvector.hpp create mode 100644 compactds/Bitvector_Compressed.hpp create mode 100644 compactds/Bitvector_Plain.hpp create mode 100644 compactds/Bitvector_RunLength.hpp create mode 100644 compactds/Bitvector_Sparse.hpp create mode 100644 compactds/CompactMapper.hpp create mode 100644 compactds/CompressedSuffixArray.hpp create mode 100644 compactds/DS_InvPermutation.hpp create mode 100644 compactds/DS_Parenthesis.hpp create mode 100644 compactds/DS_PatternRankSelect.hpp create mode 100644 compactds/DS_RangeMinMaxTree.hpp create mode 100644 compactds/DS_Rank.hpp create mode 100644 compactds/DS_Select.hpp create mode 100644 compactds/DS_Select_Test.hpp create mode 100644 compactds/DifferenceCover.hpp create mode 100644 compactds/EliasCode.hpp create mode 100644 compactds/FMBuilder.hpp create mode 100644 compactds/FMIndex.hpp create mode 100644 compactds/FixedSizeElemArray.hpp create mode 100644 compactds/FractionBitElemArray.hpp create mode 100644 compactds/HuffmanCode.hpp create mode 100644 compactds/InterleavedFixedSizeElemArray.hpp create mode 100644 compactds/InvertedIndex.hpp create mode 100644 compactds/Makefile create mode 100644 compactds/PartialSum.hpp create mode 100644 compactds/PerfectHash.hpp create mode 100644 compactds/Permutation.hpp create mode 100644 compactds/Sequence.hpp create mode 100644 compactds/SequenceCompactor.hpp create mode 100644 
compactds/Sequence_Hybrid.hpp create mode 100644 compactds/Sequence_Permutation.hpp create mode 100644 compactds/Sequence_Plain.hpp create mode 100644 compactds/Sequence_RunBlock.hpp create mode 100644 compactds/Sequence_RunLength.hpp create mode 100644 compactds/Sequence_WaveletTree.hpp create mode 100644 compactds/SimpleVector.hpp create mode 100644 compactds/SuffixArrayGenerator.hpp create mode 100644 compactds/Tree.hpp create mode 100644 compactds/Tree_BP.hpp create mode 100644 compactds/Tree_Cardinal.hpp create mode 100644 compactds/Tree_Cardinal_LOUDS.hpp create mode 100644 compactds/Tree_Cardinal_Ordinal.hpp create mode 100644 compactds/Tree_Cardinal_Plain.hpp create mode 100644 compactds/Tree_DFUDS.hpp create mode 100644 compactds/Tree_LOUDS.hpp create mode 100644 compactds/Tree_Labeled.hpp create mode 100644 compactds/Tree_Plain.hpp create mode 100644 compactds/UniversalHashGenerator.hpp create mode 100644 compactds/Utils.hpp create mode 100644 compactds/VariableSizeElemArray.hpp create mode 100644 compactds/VariableSizeElemArray_DensePointers.hpp create mode 100644 compactds/VariableSizeElemArray_DirectAccess.hpp create mode 100644 compactds/VariableSizeElemArray_SampledPointers.hpp create mode 100644 compactds/bitvector_benchmark.cpp create mode 100644 compactds/notes.md create mode 100644 compactds/rbbwt.cpp create mode 100644 compactds/test.cpp diff --git a/compactds/Alphabet.hpp b/compactds/Alphabet.hpp new file mode 100644 index 0000000..9c16346 --- /dev/null +++ b/compactds/Alphabet.hpp @@ -0,0 +1,196 @@ +#ifndef _MOURISL_COMPACTDS_DS_ALPHABET +#define _MOURISL_COMPACTDS_DS_ALPHABET + +#include "Utils.hpp" +#include "HuffmanCode.hpp" +#include "FixedSizeElemArray.hpp" + +typedef char ALPHABET ; + +#define ALPHABET_CODE_NOCODE 0 +#define ALPHABET_CODE_PLAIN 1 +#define ALPHABET_CODE_HUFFMAN 2 + +// The data structe for mapping alphabet +// Conceptually, all the other data structure regard the alphabet as {0,...,|sigma|-1}, +// This function serves to 
map these numeric alphabet to actually alphabet(char by default). +namespace compactds { +class Alphabet +{ +private: + size_t _space ; + int _method ; + ALPHABET *_alphabetList ; + int _alphabetCode[1<<(sizeof(ALPHABET) * 8)] ; + short _alphabetCodeLen[1<<(sizeof(ALPHABET) * 8)] ; // the length of encoded bits. + size_t _n ; + + HuffmanCode huffmanCode ; +public: + Alphabet() + { + _n = _space = 0 ; + _method = ALPHABET_CODE_NOCODE ; + } + + ~Alphabet() { Free() ; } + + void Free() + { + if (_n != 0) + free(_alphabetList) ; + } + + size_t GetSpace() { return _space + sizeof(*this); } + + // Use plain binary number sequentially for the characters in s. + // @return: code length + int InitFromList(const ALPHABET *s, size_t n) + { + size_t i ; + this->_n = n ; + _alphabetList = (ALPHABET *)malloc(sizeof(ALPHABET) * n) ; + _space = sizeof(ALPHABET) * n ; + memset(_alphabetCode, 0, sizeof(_alphabetCode)) ; + memset(_alphabetCodeLen, 0, sizeof(_alphabetCodeLen)) ; + + int codeLen = Utils::Log2Ceil(n) ; + for (i = 0 ; i < n ; ++i) + { + _alphabetList[i] = s[i] ; + _alphabetCode[ (int)s[i] ]= i ; + _alphabetCodeLen[ (int)s[i] ] = codeLen ; + } + _method = ALPHABET_CODE_PLAIN ; + return codeLen ; + } + + // s: list of the characters + // freq: list of the frequencies for each character + // n: number of character + void InitHuffman(const ALPHABET *s, const uint64_t *freq, size_t n) + { + size_t i ; + this->_n = n ; + _alphabetList = (ALPHABET *)malloc(sizeof(ALPHABET) * n) ; + for (i = 0 ; i < n ; ++i) + _alphabetList[i] = s[i] ; + + huffmanCode.InitFromFrequency(freq, n) ; + + for (i = 0 ; i < n ; ++i) + { + int l ; + _alphabetCode[i] = huffmanCode.Encode(i, l) ; + _alphabetCodeLen[i] = l ; + } + _method = ALPHABET_CODE_HUFFMAN ; + } + + size_t GetAlphabetCapacity() const + { + if (ALPHABET_CODE_NOCODE) + return 0 ; + else if (ALPHABET_CODE_PLAIN) + return 1<<(Utils::Log2Ceil(_n)) ; + else if (ALPHABET_CODE_HUFFMAN) + return _n ; + return 0 ; + } + + size_t GetSize() 
const + { + return _n ; + } + + // l: how many bits used in the coding + ALPHABET Decode(WORD c, int l) const + { + //l = _alphabetCodeLen[ (int)_alphabetList[i] ] ; + size_t i ; + if (_method == ALPHABET_CODE_NOCODE) + { + return c ; + } + + if (_method == ALPHABET_CODE_PLAIN) + i = c ; + else + i = huffmanCode.Decode(c, l) ; + return _alphabetList[i] ; + } + + WORD Encode(ALPHABET c, int &l) const + { + if (_method == ALPHABET_CODE_NOCODE) + { + //l = Utils::CountBits(c) ; + l = 0 ; + return c ; + } + else + { + l = _alphabetCodeLen[(int)c] ; + return _alphabetCode[(int)c] ; + } + } + + WORD Encode(ALPHABET c) const + { + if (_method == ALPHABET_CODE_NOCODE) + return c ; + else + return _alphabetCode[(int)c] ; + } + + // test whether the alphabet c is in the list + bool IsIn(ALPHABET c) const + { + size_t i ; + for (i = 0 ; i < _n ; ++i) + if (_alphabetList[i] == c) + return true ; + return false ; + } + + Alphabet& operator=(const Alphabet &in) + { + Free() ; + _n = in._n ; + _space = in._space ; + _method = in._method ; + + _alphabetList = (ALPHABET *)malloc(sizeof(ALPHABET) * _n) ; + _space = sizeof(ALPHABET) * _n ; + memcpy(_alphabetList, in._alphabetList, sizeof(ALPHABET) * _n ) ; + memcpy(_alphabetCode, in._alphabetCode, sizeof(_alphabetCode)) ; + memcpy(_alphabetCodeLen, in._alphabetCodeLen, sizeof(_alphabetCodeLen)) ; + huffmanCode = in.huffmanCode ; + return *this ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _space) ; + SAVE_VAR(fp, _method) ; + SAVE_VAR(fp, _n) ; + fwrite(_alphabetList, sizeof(ALPHABET), _n, fp) ; + fwrite(_alphabetCode, sizeof(_alphabetCode[0]), 1<<(sizeof(ALPHABET) * 8), fp) ; + fwrite(_alphabetCodeLen, sizeof(_alphabetCodeLen[0]), 1<<(sizeof(ALPHABET) * 8), fp) ; + } + + void Load(FILE *fp) + { + Free() ; + LOAD_VAR(fp, _space) ; + LOAD_VAR(fp, _method) ; + LOAD_VAR(fp, _n) ; + + _alphabetList = (ALPHABET *)malloc(sizeof(ALPHABET) * _n) ; + fread(_alphabetList, sizeof(ALPHABET), _n, fp) ; + fread(_alphabetCode, 
sizeof(_alphabetCode[0]), 1<<(sizeof(ALPHABET) * 8), fp) ; + fread(_alphabetCodeLen, sizeof(_alphabetCodeLen[0]), 1<<(sizeof(ALPHABET) * 8), fp) ; + } +} ; +} +#endif diff --git a/compactds/Bitvector.hpp b/compactds/Bitvector.hpp new file mode 100644 index 0000000..9757a32 --- /dev/null +++ b/compactds/Bitvector.hpp @@ -0,0 +1,70 @@ +#ifndef _MOURISL_COMPACTDS_BITVECTOR +#define _MOURISL_COMPACTDS_BITVECTOR + +#include "Utils.hpp" + +#define BITVECTOR_DEFAULT_SELECT_SPEED 3 + +// The overall functionality of bitvector +namespace compactds { +class Bitvector +{ +protected: + size_t _space ; +public: + Bitvector() {_space = 0 ;} + ~Bitvector() {} + + // W is the plain bit vector + virtual void Init(const WORD *W, const size_t n) = 0 ; + virtual void Free() = 0 ; + virtual size_t GetSpace() = 0; + + // Return the ith bits (0-based) + virtual int Access(size_t i) const = 0 ; + // Return the number of 1s before i + virtual size_t Rank1(size_t i, int inclusive = 1) const = 0 ; + // Return the index of th i-th (i is 1-based, so rank and select are inversible) 1 + // it is for 1 only for now + virtual size_t Select(size_t i) const = 0 ; + + // Return the rightmost 1 in [0..i] + // TODO: Handle the boundary cases + size_t Pred(size_t i) const + { + return Select( Rank1(i) ) ; + } + + // Return the leftmost 1 in [i..n-1] + size_t Succ(size_t i) const + { + return Select( Rank1(i, /*inclusive=*/0) + 1 ) ; + } + + // Return the number of 0s before i + size_t Rank0(size_t i, int inclusive = 1) const + { + // There are i+1 elements in [0..i], and Rank(i) of them are 1's + return i + inclusive - Rank1(i, inclusive) ; + } + + size_t Rank(int type, size_t i, int inclusive = 1) const + { + if (type == 1) + return Rank1(i, inclusive) ; + else + return Rank0(i, inclusive) ; + } + + virtual void Save(FILE *fp) + { + SAVE_VAR(fp, _space) ; + } + + virtual void Load(FILE *fp) + { + LOAD_VAR(fp, _space) ; + } +} ; +} +#endif diff --git a/compactds/Bitvector_Compressed.hpp 
b/compactds/Bitvector_Compressed.hpp new file mode 100644 index 0000000..fe0d011 --- /dev/null +++ b/compactds/Bitvector_Compressed.hpp @@ -0,0 +1,370 @@ +#ifndef _MOURISL_COMPACTDS_BITVECTOR_COMPRESSED +#define _MOURISL_COMPACTDS_BITVECTOR_COMPRESSED + +#include "Utils.hpp" +#include "Bitvector.hpp" + +#include "FixedSizeElemArray.hpp" + +// The compressed bitvector based on chaptor 4 +// This seems to be the RRR bitvector +namespace compactds { +class Bitvector_Compressed : public Bitvector +{ +private: + int _b ; // block size for bit vector + int _pb ; // block size for partial sum array _P + size_t _n ; // the total raw length of the bits + + // Variables for compress the bit vector + FixedSizeElemArray _C ; // the array for the count of bits + WORD *_O ; // encoded offsets within each block + size_t *_P ; // partial sum on offset array _O + + uint64_t **_choose ; // _C(i, j) + int *_L ; // the required bits for _O for each _Ci + + // Variables for ranking query + uint64_t *_R ; // precomputed rank (right-exclusive). 
R and _P are aligned + + // Variables for selection + size_t *_S ; + int _sb ; // the block size for selection + size_t _sBlockCnt ; + + int _selectSpeed ; + + void EncodeBits(const WORD &B, int &c, size_t &o) const + { + int i ; + WORD maskedB = B & MASK(_b) ; + int onecnt = Utils::Popcount(maskedB) ; + o = 0 ; + c = 0 ; + for (i = _b - 1 ; i >= 0 ; --i) + { + if ((maskedB >> i) & 1) + { + o += _choose[i][onecnt - c] ; + ++c ; + } + } + } + + WORD DecodeBits(int c, size_t o) const + { + WORD ret = 0 ; + int usedOnes = 0 ; + int i ; + for (i = _b - 1 ; i >= 0 ; --i) + { + ret <<= 1 ; + if (o >= _choose[i][c - usedOnes]) + { + ret |= 1 ; + o -= _choose[i][c - usedOnes]; + ++usedOnes ; + } + } + + return ret ; + } + + void InitChoose(int b) + { + int i, j ; + + // Build the _choose array + _choose = (uint64_t**)malloc(sizeof(*_choose) * (b+1) ) ; + _space += sizeof(*_choose) * (b + 1) ; + for (i = 0 ; i <= b ; ++i) + { + _choose[i] = (uint64_t*)malloc(sizeof(**_choose) * (i + 2)) ; + _space += sizeof(**_choose) * (i + 2) ; + } + for (i = 0 ; i <= b ; ++i) + { + _choose[i][0] = 1 ; + for (j = 1 ; j < i ; ++j) + { + _choose[i][j] = _choose[i - 1][j - 1] + _choose[i - 1][j] ; + } + _choose[i][i] = 1 ; + _choose[i][i + 1] = 0 ; + } + + _L = (int *)malloc(sizeof(*_L) * (b + 1)) ; + _space += sizeof(*_L) * (b + 1) ; + for (i = 0 ; i <= b ; ++i) + { + // There are _choose[b][i] different combinations for each _C_i + _L[i] = Utils::Log2Ceil(_choose[b][i]) ; + } + } + +public: + Bitvector_Compressed() + { + _n = _b = _pb = _sb = 0 ; + _selectSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + } + ~Bitvector_Compressed() {Free();} + + // blockSize should be 2^x - 1.so _C can be fully utilized + void SetBlockSize(int blockSize) + { + _b = blockSize ; + } + + void SetPsumBlockSize(int psumBlockSize) + { + _pb = psumBlockSize ; + } + + void SetSelectBlockSize(int selectBlockSize) + { + _sb = selectBlockSize ; + } + + void SetSelectSpeed(int in) + { + _selectSpeed = in ; + } + + // W is 
the plain bit vector + void Init(const WORD *W, const size_t n) + { + size_t i, j ; + this->_n = n ; + _space = 0 ; + + if (_b <= 0) + _b = 8 * sizeof(WORD) - 1 ; + + if (_pb <= 0) + _pb = 8 * sizeof(size_t) ; + + if (_sb <= _b) + _sb = 8 * sizeof(size_t) * 8 * sizeof(size_t) ; + size_t blockCnt = DIV_CEIL(n, _b) ; + InitChoose(_b) ; // Initialize _choose and _L + + _C.Malloc(Utils::Log2Ceil(_b + 1), blockCnt) ; + _space += _C.GetSpace() ; + + // _Calculate the size for _O + size_t offsetsSize = 0 ; + int maxOneCntInBlock = 0 ; + uint64_t totalOneCnt = 0 ; + for (i = 0 ; i < _n ; i += _b) + { + int onecnt = Utils::Popcount( Utils::BitsRead(W, i, (i + _b < _n ? i + _b - 1 : _n - 1)) ) ; + + offsetsSize += _L[onecnt] ; + totalOneCnt += onecnt ; + if (onecnt > maxOneCntInBlock) + maxOneCntInBlock = onecnt ; + } + + _O = Utils::MallocByBits(offsetsSize) ; + _space += Utils::BitsToWordBytes(offsetsSize) ; + + size_t psumBlockCnt = DIV_CEIL(blockCnt, _pb) ; + _P = (size_t *)malloc(sizeof(size_t) * psumBlockCnt) ; + _space += sizeof(size_t) * psumBlockCnt ; + + _R = (uint64_t *)malloc(sizeof(uint64_t) * psumBlockCnt) ; + _space += sizeof(uint64_t) * psumBlockCnt ; + + if (_selectSpeed > 0) + { + _sBlockCnt = DIV_CEIL(totalOneCnt, _sb) ; + _S = (size_t *)malloc(sizeof(size_t) * _sBlockCnt) ; + _space += sizeof(uint64_t) * _sBlockCnt ; + } + + // Build the _C, _O, _P that compress the bit vector + // Also build the data structures for rank and selections + // j is used to index _O + size_t blocki ; + uint64_t onecntSum = 0 ; + bool locateFirstOne = false ; + for (i = 0, j = 0, blocki = 0 ; i < _n ; i += _b, ++blocki) + { + WORD bits = Utils::BitsRead(W, i, (i + _b < _n ? i + _b - 1 : _n - 1)) ; + + int tmpc ; + size_t tmpo ; + EncodeBits(bits, tmpc, tmpo) ; + + //printf("%d %llu. 
%llu\n", tmpc, tmpo, bits) ; + if (blocki % _pb == 0) + { + _P[blocki/_pb] = j ; + } + _C.Write(blocki, tmpc) ; + if (_L[tmpc] > 0) + { + Utils::BitsWrite(_O, j, j + _L[tmpc] - 1, tmpo) ; + } + j += _L[tmpc] ; + + // _Process the information for rank operation + if (blocki % _pb == 0) + _R[blocki / _pb] = onecntSum ; + + // _Process the information for select operation + if (_selectSpeed && (onecntSum / _sb != (onecntSum + tmpc) / _sb + || (!locateFirstOne && tmpc > 0))) + { + int localOneCnt = 0 ; + int l = 0 ; + for (l = 0 ; l < _b ; ++l) + if ((bits >> l)&1) + { + if ((onecntSum + localOneCnt) % _sb == 0) + { + _S[(onecntSum + localOneCnt) / _sb] = i + l ; + break ; + } + ++localOneCnt ; + } + locateFirstOne = true ; + } + onecntSum += tmpc ; + } + } + + void Free() + { + _C.Free() ; + if (_n != 0) + { + int i ; + for (i = 0 ; i <= _b ; ++i) + free(_choose[i]) ; + free(_choose) ; + free(_L) ; + + free(_O) ; + free(_P) ; + + free(_R) ; + + free(_S) ; + _n = 0 ; + } + } + + // Return the ith bits (0-based) + int Access(size_t i) const + { + // Get the partial sum from _P + size_t bi = i / _b ; + size_t pi = bi / _pb ; + + size_t j ; + int blockc = _C.Read(bi) ; + if (blockc == 0) + return 0 ; + else if (blockc == _b) + return 1 ; + + size_t blocko ; + size_t os = _P[pi] ; // start position in o + // j to index the block offsets + for (j = pi * _pb ; j < bi ; ++j) + os += _L[ _C.Read(j) ] ; + blocko = Utils::BitsRead(_O, os, os + _L[blockc] - 1) ; + + WORD bits = DecodeBits(blockc, blocko) ; + + int residuali = i % _b ; + return (bits >> residuali) & 1 ; + } + + // Return the _number of 1s before i + size_t Rank1(size_t i, int inclusive = 1) const + { + size_t j ; + size_t bi = i / _b ; // index for block + size_t ri = bi / _pb ; // index for R + + size_t ret = _R[ri] ; + size_t os = _P[ri] ; + for (j = ri * _pb ; j < bi ; ++j) + { + int onecnt = _C.Read(j) ; + ret += onecnt ; + os += _L[onecnt] ; + } + int blockc = _C.Read(bi) ; + if (blockc == 0) + return ret ; + 
else if (blockc == _b) + return ret + i%_b + inclusive ; + else + { + size_t blocko = Utils::BitsRead(_O, os, os + _L[blockc] - 1) ; + + WORD bits = DecodeBits(blockc, blocko) ; + + int residuali = i % _b ; + return ret + Utils::Popcount( bits & MASK_WCHECK(residuali + inclusive) ) ; + } + } + + // Return the index of th i-th (1-based, so rank and select are inversible) 1 + size_t Select(size_t i) const + { + if (i == 0) + return POSITIVE_INF ; + // Unlike the uncompressed case, binary search might be less efficient + // because we _need to sequentially find the appropriate _O in rank. + size_t j ; + size_t si = (i-1) / _sb ; + size_t bi = _S[si] / _b ; // it aligns to block bi + size_t pi = bi / _pb ; // block bi belongs to the _P-block recording the offset in _O + + size_t os = _P[pi] ; + // We rollback the index a little bit to align with the information of _O + uint64_t onecntSum = _R[pi] ; // Another bless that R and _P are aligned + // j index the block + for (j = pi * _pb ; j * _b < _n ; ++j) + { + int blockc = _C.Read(j) ; + if (onecntSum + blockc >= i) + { + // the desired 1 is in this block + size_t blocko = Utils::BitsRead(_O, os, os + _L[blockc] - 1) ; + WORD bits = DecodeBits(blockc, blocko) ; + + int l ; + for (l = 0 ; l < _b ; ++l) + if ((bits >> l) & 1) + { + ++onecntSum ; + if (onecntSum == i) + return j * _b + l ; + } + break ; + } + os += _L[blockc] ; + onecntSum += blockc ; + } + return 0 ; + } + + size_t Select(int type, size_t i) const + { + return 0 ; + } + + size_t GetSpace() + { + return _space + sizeof(*this) ; + } +} ; +} + +#endif diff --git a/compactds/Bitvector_Plain.hpp b/compactds/Bitvector_Plain.hpp new file mode 100644 index 0000000..8e37f2d --- /dev/null +++ b/compactds/Bitvector_Plain.hpp @@ -0,0 +1,206 @@ +#ifndef _MOURISL_COMPACTDS_BITVECTOR_PLAIN +#define _MOURISL_COMPACTDS_BITVECTOR_PLAIN + +#include "Utils.hpp" +#include "Bitvector.hpp" + +#include "DS_Rank.hpp" +#include "DS_Select.hpp" + +// The bitvector with +namespace 
compactds { +class Bitvector_Plain : public Bitvector +{ +private: + size_t _n ; // the total raw length of the bits + + // Variables for the bit vector + WORD *_B ; // bitvector packed in WORD array + + // Variables for _ranking query + DS_Rank9 _rank ; + int _rb ; + + // Variables for _selection + DS_Select _select ; + int _sb ; + + int _selectSpeed ; + int _selectTypeSupport ; + +public: + Bitvector_Plain() + { + _n = _rb = _sb = 0 ; + _B = NULL ; + _selectSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + _selectTypeSupport = 3 ; + } + ~Bitvector_Plain() {Free();} + + void SetRankBlockLength(int rBlockSize) + { + _rb = rBlockSize ; + } + + void SetSelectBlockLength(int sBlockSize) + { + _sb = sBlockSize ; + } + + void SetSelectSpeed(int selectSpeed) + { + this->_selectSpeed = selectSpeed ; + } + + void SetSelectTypeSupport(int selectTypeSupport) + { + this->_selectTypeSupport = selectTypeSupport ; + } + + + void Malloc(const size_t &n) + { + this->_n = n ; + _B = Utils::MallocByBits(n) ; + + _space = Utils::BitsToWordBytes(n) ; + } + + void Free() + { + if (_B != NULL) + { + free(_B) ; + _B = NULL ; + } + _rank.Free() ; + _select.Free() ; + _n = 0 ; + } + + // Use with caution that the _rank and + void BitSet(size_t i) + { + Utils::BitSet(_B, i) ; + } + + void BitClear(size_t i) + { + Utils::BitClear(_B, i) ; + } + + // W is the plain bit vector + void Init(const WORD *W, const size_t n) + { + _n = n ; + Malloc(n) ; + memcpy(_B, W, Utils::BitsToWordBytes(n)) ; + + Init() ; + } + + // This is for when _B is already allocated + void Init() + { + _space = Utils::BitsToWordBytes(_n) ; + _rank.Free() ; + _select.Free() ; + //_rank.Init(_rb, _B, _n) ; + _rank.Init(_B, _n) ; + _space += _rank.GetSpace() - sizeof(_rank) ; + _select.Init(_sb, _B, _n, _selectSpeed, _selectTypeSupport) ; + _space += _select.GetSpace() - sizeof(_select) ; + } + + // Return the ith bits (0-based) + int Access(size_t i) const + { + return Utils::BitRead(_B, i) ; + } + + // Return the number of 1s 
before i + size_t Rank1(size_t i, int inclusive = 1) const + { + return _rank.Query(i, _B, _n, inclusive) ; + } + + // Return the index of th i-th (this i is 1-based, so rank and select are inversible) 1 + size_t Select(size_t i) const + { + return _select.Query(i, _rank, _B, _n) ; + } + + size_t Select(int type, size_t i) const + { + if (type == 1) + return _select.Query(i, _rank, _B, _n) ; + else + return _select.Query0(i, _rank, _B, _n) ; + } + + // Pred/successor on bit 0 + size_t Pred0(size_t i) const + { + return Select(0, Rank(0, i)) ; + } + + size_t Succ0(size_t i) const + { + return Select(0, Rank(0, i, 0) + 1) ; + } + + size_t GetSpace() + { + return _space + sizeof(*this) ; + } + + const WORD *GetData() const + { + return _B ; + } + + void Save(FILE *fp) + { + Bitvector::Save(fp) ; + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _rb) ; + SAVE_VAR(fp, _sb) ; + SAVE_VAR(fp, _selectSpeed) ; + SAVE_VAR(fp, _selectTypeSupport) ; + if (_n > 0) + { + fwrite(_B, sizeof(*_B), Utils::BitsToWords(_n), fp) ; + _rank.Save(fp) ; + _select.Save(fp) ; + } + } + + void Load(FILE *fp) + { + Free() ; + Bitvector::Load(fp) ; + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _rb) ; + LOAD_VAR(fp, _sb) ; + LOAD_VAR(fp, _selectSpeed) ; + LOAD_VAR(fp, _selectTypeSupport) ; + + if (_n > 0) + { + _B = Utils::MallocByBits(_n) ; + fread(_B, sizeof(*_B), Utils::BitsToWords(_n), fp) ; + _rank.Load(fp) ; + _select.Load(fp) ; + } + else + { + _B = NULL ; + //_rank.Free() ; + //_select.Free() ; + } + } +} ; +} + +#endif diff --git a/compactds/Bitvector_RunLength.hpp b/compactds/Bitvector_RunLength.hpp new file mode 100644 index 0000000..dfe8c07 --- /dev/null +++ b/compactds/Bitvector_RunLength.hpp @@ -0,0 +1,228 @@ +#ifndef _MOURISL_COMPACTDS_BITVECTOR_RUNLENGTH +#define _MOURISL_COMPACTDS_BITVECTOR_RUNLENGTH + +#include "Utils.hpp" + +#include "Bitvector.hpp" +#include "Bitvector_Sparse.hpp" +#include "PartialSum.hpp" +#include "SimpleVector.hpp" + +// The run-length bitvector built upon the sparse bit vector 
+// Based on section: +namespace compactds { +class Bitvector_RunLength: public Bitvector +{ +protected: + bool _zerofirst ; // whether the run-length array starts with 0 or _not. + int _partialSumSpeed ; + size_t _n ; // total _number of bits + size_t _rcnt ; // the _number of runs + + PartialSum _R ; // the partial sum of runs + PartialSum _O ; // the partial sum of 1s +public: + Bitvector_RunLength() + { + _partialSumSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + } + ~Bitvector_RunLength() {} + + void Free() + { + _R.Free() ; + _O.Free() ; + } + + size_t GetSpace() + { + return _R.GetSpace() - sizeof(_R) + + _O.GetSpace() - sizeof(_O) + sizeof(*this) ; + } + + void SetSelectSpeed(int speed) + { + } + + void SetPartialSumSpeed(int _partialSumSpeed) + { + _R.SetSpeed(_partialSumSpeed) ; + _O.SetSpeed(_partialSumSpeed) ; + } + + void SetSupportSelect(int supportSelect) + { + _O.SetSupportSearch(supportSelect) ; + } + + // W is the plain bit vector + void Init(const WORD *W, const size_t n) + { + size_t i ; + if (n == 0) + return ; + _n = n ; + _zerofirst =false ; + if (Utils::BitRead(W, 0) == 0) + _zerofirst = true ; + + WORD *B = Utils::MallocByBits(n + 1) ; // bits for the sums of runs + WORD *oneB = Utils::MallocByBits(n + 1) ; // bits for the sums of runs of 1s + Utils::BitSet(B, 0) ; + Utils::BitSet(oneB, 0) ; + size_t oneLen = 0 ; // run length for ones + int prevc = 0 ; + if (!_zerofirst) + { + prevc = 1 ; + oneLen = 1 ; + } + + for (i = 1 ; i < n ; ++i) + { + int c = Utils::BitRead(W, i) ; + if (c) + ++oneLen ; + if (c != prevc) + { + Utils::BitSet(B, i) ; + if (c == 0) // previous c == 1 + Utils::BitSet(oneB, oneLen) ; + } + prevc = c ; + } + + Utils::BitSet(B, n) ; + Utils::BitSet(oneB, oneLen) ; + _R.InitFromBitvector(B, n + 1) ; + _O.InitFromBitvector(oneB, oneLen + 1) ; + free(B) ; + free(oneB) ; + + /*size_t len = 1 ; + SimpleVector rlens ; + rlens.Reserve(_n / WORDBITS + 1) ; + for (i = 1 ; i < _n ; ++i) + { + if (Utils::BitRead(W, i) != 
Utils::BitRead(W, i - 1)) + { + rlens.PushBack(len) ; + len = 1 ; + } + else + ++len ; + } + rlens.PushBack(len) ; + InitFromRunLength(rlens.BeginAddress(), rlens.Size(), _n, _zerofirst) ;*/ + } + + void InitFromRunLength(const int *rlens, const size_t rcnt, const size_t n, const bool zerofirst) + { + this->_rcnt = rcnt ; + this->_n = n ; + this->_zerofirst = zerofirst ; + + _R.Init(rlens, _rcnt) ; + + size_t i = 0 ; + uint64_t *oneSums = (uint64_t *)malloc(sizeof(*oneSums) * (_rcnt / 2 + 2)); + + if (_zerofirst) + i = 1 ; + + uint64_t psum = 0 ; + for ( ; i < _rcnt ; i += 2) + { + oneSums[i/2] = psum ; + psum += rlens[i] ; + } + oneSums[i/2] = psum ; + _O.InitFromPartialSum(oneSums, i/2) ; + free(oneSums) ; + } + + // Return the ith bits (0-based) + int Access(size_t i) const + { + size_t ri = _R.Search(i) ; + int inOne = (ri&1) ; //whether ri is block for 1 or 0 + if (!_zerofirst) + inOne = 1 - inOne ; + return inOne ; + } + + // Return the _number of 1s before i + size_t Rank1(size_t i, int inclusive = 1) const + { + size_t ri = _R.Search(i) ; // run-length block index + size_t oi = ri / 2 ; + int inOne = (ri&1) ; //whether ri is block for 1 or 0 + if (!_zerofirst) + inOne = 1 - inOne ; + if (!inOne) + { + // Each small block is a (00..11..) xx, + // or (11..00..) runs, + // so we _need to adjust whether we want the sum of 1's include the current small bock or _not. + if (_zerofirst) + return _O.Sum(oi) ; + else + return _O.Sum(oi + 1) ; + } + else + { + //The sum of 1s before current run and the _number of 1s in the current block + return _O.Sum(oi) + (i - _R.Sum(ri) + inclusive) ; + } + } + + // Return the index of th i-th (i is 1-based, so rank and select are inversible) 1 + // Did not have a select mode for 0 here. 
+ size_t Select(size_t i) const + { + if (i == 0) + return POSITIVE_INF ; + + --i ; + size_t oi = _O.Search(i) ; + // Map oi back to the ri + size_t ri = 2 * oi ; + if (_zerofirst) + ++ri ; + return _R.Sum(ri) + (i - _O.Sum(oi)) ; + } + + size_t Select(int type, size_t i) const + { + if (type == 1) + return Select(i) ; + else + return POSITIVE_INF ; + } + + void Save(FILE *fp) + { + Bitvector::Save(fp) ; + SAVE_VAR(fp, _zerofirst) ; + SAVE_VAR(fp, _partialSumSpeed) ; + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _rcnt) ; + _R.Save(fp) ; + _O.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + Bitvector::Load(fp) ; + LOAD_VAR(fp, _zerofirst) ; + LOAD_VAR(fp, _partialSumSpeed) ; + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _rcnt) ; + _R.Load(fp) ; + _O.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/Bitvector_Sparse.hpp b/compactds/Bitvector_Sparse.hpp new file mode 100644 index 0000000..c20ad35 --- /dev/null +++ b/compactds/Bitvector_Sparse.hpp @@ -0,0 +1,336 @@ +#ifndef _MOURISL_COMPACTDS_BITVECTOR_SPARSE +#define _MOURISL_COMPACTDS_BITVECTOR_SPARSE + +#include "Utils.hpp" +#include "Bitvector.hpp" +#include "FixedSizeElemArray.hpp" +#include "Bitvector_Plain.hpp" + +// The very sparse bitvector based on chaptor 4.4 +// This data structure seems can be used for set predecessor query when +// the elements are increasing? +// +// This is a super clever data structure. Come and think about it from time to time. 
+namespace compactds { +class Bitvector_Sparse: public Bitvector +{ +private: + size_t _n ; // total length of the bit vector + size_t _onecnt ; // number of 1s in the bit vector + size_t _lastOneIdx ; // the index of the last one + int _lowerBits ; // the split for lower and upper bits + + FixedSizeElemArray _L ; // stores lower bits (of size r) + Bitvector_Plain _H ; // stores the higher bits + + int _hSelectSpeed ; // this is the speed for H +public: + Bitvector_Sparse() + { + _lowerBits = 0 ; + _hSelectSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + } + ~Bitvector_Sparse() + { + Free() ; + } + + void Free() + { + _n = 0 ; + _L.Free() ; + _H.Free() ; + } + + size_t GetSpace() + { + return _space + _L.GetSpace() - sizeof(_L) + + _H.GetSpace() - sizeof(_H) + sizeof(*this) ; + } + + void SetLowerBits(int lowerBits) + { + this->_lowerBits = lowerBits ; + } + + // the speed is for H.select + void SetSpeed(int speed) + { + _H.SetSelectSpeed(speed) ; + } + + void SetSupportRank(bool supportRank) + { + _H.SetSelectTypeSupport(supportRank ? 
3 : 2) ; + } + + size_t GetLastOneIdx() + { + return _lastOneIdx ; + } + + size_t GetOneCnt() + { + return _onecnt ; + } + + // Init directly from the bit vector + // W is the plain bit vector + void Init(const WORD *W, const size_t n) + { + size_t i, k ; + size_t wordCnt = Utils::BitsToWords(n) ; + + _n = n ; + _onecnt = 0 ; + for (i = 0 ; i < wordCnt ; ++i) + _onecnt += Utils::Popcount(W[i]) ; + + if (_onecnt == 0 || wordCnt == 0) + { + _lastOneIdx = 0 ; + return ; + } + + // Get the last 1 + i = wordCnt - 1 ; + while (1) + { + if (W[i] == 0) + { + --i ; + continue ; + } + else + { + int j ; + for (j = WORDBITS - 1 ; j >= 0 ; --j) + { + if ((W[i] >> j) & 1) + { + _lastOneIdx = i * WORDBITS + j ; + break ; + } + } + break ; + } + if (i == 0) + break ; + else + --i ; + } + + if (_lowerBits == 0) + _lowerBits = int(log((double)n / _onecnt) / log(2.0)) ; + + if (_lowerBits < 1) + _lowerBits = 1 ; + _L.Malloc(_lowerBits, _onecnt) ; + + size_t hsize = (_lastOneIdx >> _lowerBits) + _onecnt + 1; // need +1 here to accommdate the max value + ++hsize ; // Plus one here is because we want to append a 0 to the last block + _H.Malloc(hsize) ; + + k = 0 ; + for (i = 0 ; i < n ; i += WORDBITS) + { + WORD w = W[i/WORDBITS] ; + if (w == 0) + continue ; + size_t j ; + for (j = 0 ; j < WORDBITS && i + j < n ; ++j) + if ((w >>j) & 1) + { + _L.Write(k, (i + j) & MASK(_lowerBits)) ; + _H.BitSet(((i + j) >> _lowerBits) + k) ; + ++k ; + } + } + _H.Init() ; + } + + // Init from the know positions of 1s + void InitFromOnes(const uint64_t *S, const size_t onecnt, const size_t n) + { + size_t i ; + + _n = n ; + _onecnt = onecnt ; + if (_onecnt > 0) + _lastOneIdx = S[onecnt - 1] ; + else + { + _lastOneIdx = 0 ; + return ; + } + + if (_lowerBits == 0) + _lowerBits = int(log((double)n / onecnt) / log(2.0)) ; + + if (_lowerBits < 1) + _lowerBits = 1 ; + _L.Malloc(_lowerBits, onecnt) ; + + size_t hsize = (S[onecnt - 1] >> _lowerBits) + onecnt + 1; // need +1 here to accommdate the max value + 
++hsize ; // Plus one here is because we want to append a 0 to the last block + _H.Malloc(hsize) ; + for (i = 0 ; i < onecnt ; ++i) + { + _L.Write(i, S[i] & MASK(_lowerBits)) ; + _H.BitSet((S[i] >> _lowerBits) + i) ; + } + _H.Init() ; + } + + // Return the ith bits (0-based) + int Access(size_t i) const + { + if (Pred(i) == i) + return 1 ; + else + return 0 ; + } + + // Return the number of 1s before i + size_t Rank1(size_t i, int inclusive = 1) const + { + if (inclusive == 0) + { + if (i == 0) + return 0 ; + else + --i ; + } + + if (i >= _lastOneIdx) // this should contains the case that i>=n + return _onecnt ; + + size_t iH = i >> _lowerBits ; + size_t iL = i & MASK(_lowerBits) ; + size_t l, m, r ; + + // We don't want to +1 for iH in select because the + // 0 marks the beginning the block with starts with iH<= last one index, H.Select(iH)==H.Select(iH+1), + // then l>r. + // Fortunately, we handle this case at the beginning. + if (iH == 0 || _H.Access( selectIH + 1 ) != 0) + r = _H.Select(0, iH + 1) - iH ; + else + r = selectIH + 1 - iH ; + + if (l == r || _L.Read(l) > iL) + { + // The current r block is empty + // or the first element in the block is greater than what we search for. + // So the number of 1s before current block (l) is the answer + return l ; + } + + // r points to the start of the next r block, so we need -1 + // to make it match with the end of the current block + // The l==r test above makes sure r-1 is non-negative here. + --r ; + while (l <= r) + { + m = (l + r) / 2 ; + if (_L.Read(m) <= iL) + l = m + 1 ; + else + { + if (r == 0) + break ; // the test before the binary search make sure at least one element + // in the block is less than the desired target, + // so we can directly termiante the binary search. 
+ else + r = m - 1 ; + } + } + return l ; // l-1 is the last element index <= the desired one, so l is the number element + } + + // Return the index of th i-th (i is 1-based, so rank and select are inversible) 1 + size_t Select(size_t i) const + { + if (i > _onecnt) + return _lastOneIdx ; + if (i == 0) + return POSITIVE_INF ; + // Use (i-1) instead of i to convert the 1-based to 0-based, which is + // the base when creating H. + return ((_H.Select(1, i) - (i - 1)) << _lowerBits) + _L.Read(i - 1) ; + } + + size_t Select(int type, size_t i) const + { + if (type == 1) + return Select(i) ; + else + { + // Sadly, we can only do plain binary search + // Haven't tested it yet. + // Don't recommend this operation. + size_t l = 0 ; + size_t r = _n - 1 ; + size_t m ; + + while (l <= r) + { + m = (l + r) / 2 ; + if (m - Select(m) < i) + l = m + 1 ; + else + { + if (m == 0) + return 0 ; + else + r = m - 1 ; + } + } + return r + 1 ; + } + } + + void Save(FILE *fp) + { + Bitvector::Save(fp) ; + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _onecnt) ; + SAVE_VAR(fp, _lastOneIdx) ; + SAVE_VAR(fp, _lowerBits) ; + SAVE_VAR(fp, _hSelectSpeed) ; + _L.Save(fp) ; + _H.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + Bitvector::Load(fp) ; + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _onecnt) ; + LOAD_VAR(fp, _lastOneIdx) ; + LOAD_VAR(fp, _lowerBits) ; + LOAD_VAR(fp, _hSelectSpeed) ; + _L.Load(fp) ; + _H.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/CompactMapper.hpp b/compactds/CompactMapper.hpp new file mode 100644 index 0000000..661f5f4 --- /dev/null +++ b/compactds/CompactMapper.hpp @@ -0,0 +1,130 @@ +#ifndef _MOURISL_COMPACTDS_COMPACTMAPPER +#define _MOURISL_COMPACTDS_COMPACTMAPPER + +// Map a set of m distinct elements to [0,m-1] +#include +#include +#include + +#include "FixedSizeElemArray.hpp" +#include "Bitvector_Plain.hpp" +#include "Bitvector_Sparse.hpp" + +namespace compactds { +class CompactMapper +{ +private: + bool _sparse ; // whether use sparse representation + 
Bitvector_Plain _P ; + Bitvector_Sparse _S ; + size_t _m ; +public: + CompactMapper() + { + } + + ~CompactMapper() + { + Free() ; + } + + size_t GetSpace(int inclusive = true) + { + return _P.GetSpace() - sizeof(_P) + _S.GetSpace() - sizeof(_S) + (inclusive ? sizeof(*this) : 0) ; + } + + void Free() + { + _P.Free() ; + _S.Free() ; + } + + void Init(const FixedSizeElemArray &a, size_t n, bool sparse) + { + size_t i ; + _sparse = sparse ; + if (sparse) + { + std::map reduceMap ; + std::vector elems ; + size_t max = 0 ; + for (i = 0 ; i < n ; ++i) + { + size_t tmp = a.Read(i) ; + if (reduceMap.find(tmp) == reduceMap.end()) + { + reduceMap[tmp] = i ; + elems.push_back(tmp) ; + if (tmp > max) + max = tmp ; + } + } + std::sort(elems.begin(), elems.end()) ; + _m = elems.size() ; + _S.InitFromOnes(elems.data(), max + 1, _m) ; + } + else + { + size_t max = 0 ; + for (i = 0 ; i < n ; ++i) + { + size_t tmp = a.Read(i) ; + if (tmp > max) + max = tmp ; + } + + _P.Malloc(max + 1) ; + for (i = 0 ; i < n ; ++i) + { + size_t tmp = a.Read(i) ; + _P.BitSet(tmp) ; + } + _P.Init() ; + _m = _P.Rank1(max) ; + } + } + + size_t GetCompactSize() const + { + return _m ; + } + + size_t Map(size_t v) const + { + if (_sparse) + return _S.Rank1(v, 0) ; + else + return _P.Rank1(v, 0) ; + } + + size_t MapBack(size_t i) const + { + if (_sparse) + return _S.Select(i + 1) ; + else + return _P.Select(i + 1) ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _sparse) ; + SAVE_VAR(fp, _m) ; + if (_sparse) + _S.Save(fp) ; + else + _P.Save(fp) ; + } + + void Load(FILE *fp) + { + LOAD_VAR(fp, _sparse) ; + LOAD_VAR(fp, _m) ; + if (_sparse) + _S.Load(fp) ; + else + _P.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/CompressedSuffixArray.hpp b/compactds/CompressedSuffixArray.hpp new file mode 100644 index 0000000..d0e7cdc --- /dev/null +++ b/compactds/CompressedSuffixArray.hpp @@ -0,0 +1,154 @@ +#ifndef _MOURISL_COMPACTDS_COMPRESSED_SUFFIX_ARRAY +#define _MOURISL_COMPACTDS_COMPRESSED_SUFFIX_ARRAY + 
+#include "Bitvector_Sparse.hpp" +#include "Sequence_WaveletTree.hpp" + +namespace compactds { +class CompressedSuffixArray +{ +private: + size_t _space ; + Bitvector_Sparse *_Psi ; // Psi for each alphabet + Bitvector_Sparse _D ; // mark the positions of starting alphabet in suffix + Alphabet _alphabets ; // use plain alphabet set here. + size_t _n ; + size_t *_alphabetPartialSum ; + size_t _firstISA ; + ALPHABET _lastChr ; + WORD **_psiB ; // bits for encoding Psis + + size_t Rank(Sequence_WaveletTree &BWT, ALPHABET c, size_t p, int inclusive = 1) + { + size_t ret = BWT.Rank(c, p, inclusive) ; + // Since we do not use $, the last character in the original string + // will be moved to the _firstISA instead of the first position + // We need to move this back + // Potential future refactoring: appending an A to the end of the string + if (c == _lastChr && (p < _firstISA || (!inclusive && p == _firstISA))) + ++ret ; + return ret ; + } + +public: + CompressedSuffixArray() + { + _n = _space = 0 ; + } + ~CompressedSuffixArray() {} + + void Free() + { + if (_n > 0) + { + delete[] _Psi ; + } + + if (_psiB != NULL) + { + int alphabetSize = _alphabets.GetSize() ; + int i ; + for (i = 0 ; i < alphabetSize ; ++i) + free(_psiB[i]) ; + free(_psiB) ; + } + } + + // Allocate necessary memories for reading SAs + void Prepare(size_t n, ALPHABET *alphabetList) + { + int i ; + + _n = n ; + _alphabets.InitFromList(alphabetList, strlen(alphabetList)) ; + + int alphabetSize = _alphabets.GetSize() ; + _Psi = new Bitvector_Sparse[alphabetSize] ; + _psiB = (WORD **)malloc(sizeof(_psiB[0]) * alphabetSize) ; + for (i = 0 ; i < alphabetSize ; ++i) + _psiB[i] = Utils::MallocByBits(n) ; + _alphabetPartialSum = (size_t *)calloc(alphabetSize + 1, sizeof(size_t)) ; + } + + + // sa corresponding to SA[from..to], inclusive + void ReadSaChunk(FixedSizeElemArray &T, size_t n, size_t *sa, size_t from, size_t to) + { + size_t i ; + + if (to >= n) + to = n - 1 ; + int alphabetSize = _alphabets.GetSize() 
; + size_t *alphabetCount = (size_t *)calloc(alphabetSize + 1, sizeof(size_t)) ; + for (i = from ; i <= to ; ++i) + { + size_t s = sa[i - from] ; + if (s == 0) + continue ; + int c = T.Read(s - 1) ; + Utils::BitSet(_psiB[c], s) ; + ++alphabetCount[c] ; + } + _lastChr = T.Read(n - 1) ; + for (i = 1 ; i < alphabetSize ; ++i) + alphabetCount[i] += alphabetCount[i - 1] ; + for (i = 0 ; i < alphabetSize ; ++i) + _alphabetPartialSum[i + 1] += alphabetCount[i] ; + } + + // Compress everything + void Init() + { + size_t i ; + int alphabetSize = _alphabets.GetSize() ; + + for (i = 0 ; i < alphabetSize ; ++i) + _Psi[i].Init(_psiB[i], _n) ; + + for (i = 0 ; i < alphabetSize ; ++i) + free(_psiB[i]) ; + free(_psiB) ; + _psiB = NULL ; + } + + void InitFromBWT(FixedSizeElemArray &BWT, size_t n, size_t firstISA, ALPHABET *alphabetList) + { + size_t i ; + _n = n ; + _firstISA = firstISA ; + + Prepare(n, alphabetList) ; + + // Prepare the auxiliary data, e.g. rank, for BWT + int alphabetSize = _alphabets.GetSize() ; + _alphabetPartialSum = (size_t *)calloc(alphabetSize, sizeof(size_t)) ; + + for (i = 0 ; i < n ; ++i) + ++_alphabetPartialSum[ BWT.Read(i)] ; + for (i = 1 ; i < alphabetSize ; ++i) + _alphabetPartialSum[i] += _alphabetPartialSum[i - 1] ; + for (i = alphabetSize ; i > 0 ; --i) + _alphabetPartialSum[i] = _alphabetPartialSum[i - 1] ; + _alphabetPartialSum[0] = 0 ; + _lastChr = alphabetList[ BWT.Read(firstISA) ] ; + + Sequence_WaveletTree seqBWT ; + seqBWT.Init(BWT, n, alphabetList) ; + + // Compute Psi from BWT. The Psi for each alphabet is marked on the bit array psiB + size_t lastISA = _alphabetPartialSum[ _alphabets.Encode(_lastChr) ] ; + size_t p = lastISA; + for (i = 0 ; i < n ; ++i) + { + int c = BWT.Read(p) ; + size_t lf = _alphabetPartialSum[c] + Rank(seqBWT, alphabetList[c], p) ; + Utils::BitSet(_psiB[c], p) ; // Psi[lf] = p. 
Psi is the inverse function of LF mapping + p = lf ; + } + + Init() ; + } +} ; +} + +#endif diff --git a/compactds/DS_InvPermutation.hpp b/compactds/DS_InvPermutation.hpp new file mode 100644 index 0000000..186d583 --- /dev/null +++ b/compactds/DS_InvPermutation.hpp @@ -0,0 +1,137 @@ +#ifndef _MOURISL_COMPACTDS_DS_INVPERMUTATION +#define _MOURISL_COMPACTDS_DS_INVPERMUTATION + +#include "Utils.hpp" +#include "Bitvector_Sparse.hpp" +#include "Bitvector_Plain.hpp" + +// The standalone data structure for inverse query on a plain permutation using the idea of short churt. +// Time complexity: O(t) +// Space complexity: O(n/t * logn) +// Based Chapter 5.1. Difference is that the book samples is one off than this implementation +// This could be also useful for encoding the inverse function of an 1-to-1 mapping + +namespace compactds { +class DS_InvPermutation +{ +private: + size_t _t ; // step size + size_t _space ; + Bitvector_Plain _B ; // mark whether a position is sampled + FixedSizeElemArray _S ; // sampled pointer with value Pi^{-t}[x] + size_t _sampledCnt ; // |_S| +public: + DS_InvPermutation() + { + _space = 0 ; + _t = 0 ; + } + + ~DS_InvPermutation() + { + Free() ; + } + + void Free() + { + _S.Free() ; + } + + size_t GetSpace() + { + return _space + _B.GetSpace() - sizeof(_B) + sizeof(*this) ; + } + + void SetSampleRate(size_t t) + { + _t = t ; + } + + void Init(size_t *Pi, size_t n) + { + if (_t == 0) + _t = Utils::Log2Ceil(n) ; + WORD *B = Utils::MallocByBits(n) ; // the label sampled positions + WORD *V = Utils::MallocByBits(n) ; // the bits mark the cycles + + size_t i, j, k ; + + _sampledCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(V, i)) + continue ; + Utils::BitSet(V, i) ; + j = Pi[i] ; + k = 1 ; + while (j != i) + { + Utils::BitSet(V, j) ; + if (k % _t == 0) + { + Utils::BitSet(B, j) ; + ++_sampledCnt ; + } + j = Pi[j] ; + ++k ; + } + + if (k > _t) // may exist dangling part, without this, the time could be 2*_t-1 + { + 
Utils::BitSet(B, i) ; + ++_sampledCnt ; + } + } + + _B.Init(B, n) ; + _S.Malloc(Utils::Log2Ceil(n), _sampledCnt) ; + for (i = 0 ; i < n ; ++i) + { + if (!Utils::BitRead(V, i)) + continue ; + + Utils::BitFlip(V, i) ; + j = Pi[i] ; + while (Utils::BitRead(V, j)) + { + if (Utils::BitRead(B, j)) + { + // Since B[j]==1, use inclusive==0 automatically subtract the rank value by 1 + _S.Write( _B.Rank(1, j, 0), i) ; + i = j ; + } + Utils::BitFlip(V, j) ; + j = Pi[j] ; + } + if (Utils::BitRead(B, j)) + { + _S.Write( _B.Rank(1, j, 0), i) ; + } + i = j ; + } + + free(B) ; + free(V) ; + } + + //@return: Pi^{-1}[i] + size_t Query(size_t *Pi, size_t i) + { + size_t j = i ; + bool jumped = false ; + while (Pi[j] != i) + { + if (!jumped && _B.Access(j)) + { + j = _S.Read(_B.Rank(1, j, 0)) ; + jumped = true ; + } + else + j = Pi[j] ; + } + return j ; + } +} ; +} + +#endif diff --git a/compactds/DS_Parenthesis.hpp b/compactds/DS_Parenthesis.hpp new file mode 100644 index 0000000..8429eb3 --- /dev/null +++ b/compactds/DS_Parenthesis.hpp @@ -0,0 +1,150 @@ +#ifndef _MOURISL_COMPACTDS_PARENTHESIS +#define _MOURISL_COMPACTDS_PARENTHESIS + +#include "Utils.hpp" +#include "DS_RangeMinMaxTree.hpp" +#include "DS_PatternRankSelect.hpp" + +namespace compactds { +class DS_Parenthesis +{ +private: + DS_RangeMinMaxTree _rmmTree ; + DS_PatternRankSelect _patRS ; + + void GenerateRandomBalanceParenthesisSegment(WORD *B, size_t n, size_t i, size_t j) + { + if (j == i + 1) + { + Utils::BitsWrite(B, i, j, 2) ; // write binary 10 + return ; + } + else if (j <= i) + { + return ; + } + + size_t split = i + rand() % (j - i + 1) ; + while ((split - i + 1) % 2 == 1 ) + split = i + rand() % (j - i + 1) ; + + Utils::BitSet(B, i) ; + Utils::BitClear(B, j) ; + if (split == i || split == j) + GenerateRandomBalanceParenthesisSegment(B, n, i + 1, j - 1) ; + else + { + Utils::BitClear(B, split) ; + Utils::BitSet(B, split + 1) ; + GenerateRandomBalanceParenthesisSegment(B, n, i + 1, split - 1) ; + 
GenerateRandomBalanceParenthesisSegment(B, n, split + 2, j - 1) ; + } + } + +public: + DS_Parenthesis() {} + ~DS_Parenthesis() {} + + void Free() + { + _rmmTree.Free() ; + _patRS.Free() ; + } + + void SetRmmTreeBlockSize(size_t b) + { + _rmmTree.SetBlockSize(b) ; + } + + size_t GetSpace(bool inclusive = true) + { + return _rmmTree.GetSpace(false) + (inclusive ? sizeof(*this) : 0) ; + + } + + void Init(const WORD *B, size_t n, WORD pat, int patLen) + { + _rmmTree.Init(B, n) ; + if (patLen > 0) + _patRS.Init(B, n, pat, patLen) ; + } + + // Expose the internal rmmTree. + const DS_RangeMinMaxTree& GetRmmTree() const + { + return _rmmTree ; + } + + size_t Close(size_t i, const WORD *B, size_t n) const + { + // Notice that our FwdSearch include the effect of i, so the d is slightly different than the textbook. + return _rmmTree.FwdSearch(i, 0, B, n) ; + } + + size_t Open(size_t i, const WORD *B, size_t n) const + { + return _rmmTree.BwdSearch(i, 0, B, n) ; + } + + size_t Enclose(size_t i, const WORD *B, size_t n) const + { + return _rmmTree.BwdSearch(i, -1 - Utils::BitRead(B, i), B, n) ; + } + + bool IsBalance(const WORD *B, size_t n) const + { + size_t i ; + int64_t excess = 0 ; + for (i = 0 ; i < n ; ++i) + { + excess += (2 * Utils::BitRead(B, i) - 1 ) ; + if (excess < 0) + return false ; + } + return true ; + } + + size_t PatternRank(size_t i, const WORD *B, size_t n, int inclusive = 1) const + { + return _patRS.Rank(i, B, n, inclusive) ; + } + + size_t PatternSelect(size_t i, const WORD *B, size_t n) const + { + return _patRS.Select(i, B, n) ; + } + + void GenerateRandomBalanceParenthesis(WORD *B, size_t n, int seed = 17) + { + srand(seed) ; + GenerateRandomBalanceParenthesisSegment(B, n, 0, n - 1) ; + } + + void Print(FILE *fp, const WORD *B, size_t n) + { + size_t i ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + fprintf(fp, "(") ; + else + fprintf(fp, ")") ; + } + fprintf(fp, "\n") ; + } + + void Save(FILE *fp) + { + _rmmTree.Save(fp) ; + 
_patRS.Save(fp) ; + } + + void Load(FILE *fp) + { + _rmmTree.Load(fp) ; + _patRS.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/DS_PatternRankSelect.hpp b/compactds/DS_PatternRankSelect.hpp new file mode 100644 index 0000000..2e87782 --- /dev/null +++ b/compactds/DS_PatternRankSelect.hpp @@ -0,0 +1,272 @@ +#ifndef _MOURISL_COMPACTDS_DS_PATTERN_RANK_SELECT +#define _MOURISL_COMPACTDS_DS_PATTERN_RANK_SELECT + +// Binary search based method to calculate for pattern (not bit, but several bits). +// The tree structure looks like range min max tree, where we pre-record the information into blocks. +#include "Utils.hpp" + +namespace compactds { +class DS_PatternRankSelect +{ +private: + size_t _r ; // number of regions + size_t _b ; // block size + size_t _height ; + size_t *_counts ; // pattern count in each tree node + WORD _pat ; + int _patLen ; // pat is at most 64bits + size_t _space ; + + // Maps leaf id k to the tree idx + size_t LeafNum(size_t k) const + { + const size_t width = 1ull<<_height ; + if (k < 2 * _r - width) + return width - 1 + k ; + else + return width - 1 - _r + k ; + } + + // Maps tree index to leaf index + size_t NumLeaf(size_t v) const + { + const size_t width = 1ull<<_height ; + if (v >= width - 1) + return v - width + 1 ; + else + return v - width + 1 + _r ; + } + + //@return: the 0-based level of tree index v located at + int GetLevel(size_t v) const + { + return Utils::CountBits(v + 1) - 1 ; + } + + // The max tree index that is at the same level as v + size_t LevelMaxNum(size_t v) const + { + return (1ull<<(GetLevel(v) + 1)) - 2 ; + } + + // The min tree index that is at the same level as v + size_t LevelMinNum(size_t v) const + { + return LevelMaxNum(v) / 2 ; + } + + // Find the tree index containing v, l levels higher + size_t PromoteLevel(size_t v, int l) const + { + return ((v+1) >> l) - 1; + } + + // Get the leftmost and rightmost leaf id (in leaf idx) + // Haven't validated yet. 
+ size_t GetLeftmostLeaf(size_t v) const + { + size_t l = GetLevel(v) ; + size_t diff = v - LevelMinNum(v) ; + // Each node on this level covers chunk amount of leaves + size_t chunk = (1ull << (_height - l)) ; + size_t ret = (1 << _height) - 1 + diff * chunk ; + // Pretend this is a complete tree, and then adjust the extra leaves + if (ret > 2 * _r - 1) + ret /= 2 ; + return NumLeaf(ret) ; + } + + size_t GetRightmostLeaf(size_t v) const + { + size_t l = GetLevel(v) ; + size_t diff = v - LevelMinNum(v) ; + // Each node on this level covers chunk amount of leaves + size_t chunk = (1ull << (_height - l)) ; + size_t ret = (1 << _height) - 1 + (diff + 1) * chunk - 1 ; + // Pretend this is a complete tree, and then adjust the extra leaves + if (ret > 2 * _r - 1) + ret = (ret - 1) / 2 ; + return NumLeaf(ret) ; + } + +public: + DS_PatternRankSelect() + { + _space = _b = _r = 0 ; + _counts = NULL ; + } + + ~DS_PatternRankSelect() + { + Free() ; + } + + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? 
sizeof(*this) : 0) ; + } + + void Free() + { + if (_counts) + { + free(_counts) ; + _counts = NULL ; + _r = _b = _space = 0 ; + } + } + + void SetBlockSize(size_t b) + { + _b = b ; + } + + void Init(const WORD *B, size_t n, WORD pat, int patLen) + { + size_t i, j ; + _pat = pat ; + _patLen = patLen ; + + if ((int)_b <= patLen) + // Take the block size as word bits to maintain low space usage + _b = 16 * WORDBITS ; + + _r = DIV_CEIL(n, _b) ; + _height = Utils::Log2Ceil(_r) ; + + _counts = (size_t *)malloc(sizeof(size_t) * (2 * _r - 1)) ; + _space = sizeof(*_counts) * (2*_r - 1) ; + + // Fill the leaves + for (i = 0 ; i < n ; i += _b) + { + size_t l = i / _b ; // leaf id + size_t ltid = LeafNum(l) ; + size_t count = 0 ; + // the count include the positions that may stretch to the next block + for (j = i ; j + _patLen - 1 < n && j < i + _b ; ++j) + { + WORD w = Utils::BitsRead(B, j, j + _patLen - 1) ; + if (w == pat) + ++count ; + } + _counts[ltid] = count ; + } + + // Fill the internal nodes + for (i = _r - 2 ; i < _r ; --i) + { + size_t count = _counts[2 * i + 1] ; + if (2 * i + 2 < (2 * _r - 1)) + count += _counts[2 * i + 2] ; + _counts[i] = count ; + } + } + + size_t Rank(size_t i, const WORD *B, size_t n, int inclusive = 1) const + { + if (!inclusive) + { + if (i == 0) + return 0 ; + else + --i ; + } + size_t j ; + size_t lid = i / _b ; + size_t rank = 0 ; + size_t v = 0 ; + + while (2 * v + 2 < 2 * _r - 1) + { + if (lid <= GetRightmostLeaf(2 * v + 1)) + { + v = 2 * v + 1 ; + } + else + { + rank += _counts[2 * v + 1] ; + v = 2 * v + 2 ; + } + } + + for (j = i/_b * _b ; j <= i && j + _patLen - 1 < n; ++j) + { + // TODO: this part could be optimized by read in one WORD and + // use left shift+mask to search the pattern. 
+ WORD w = Utils::BitsRead(B, j, j + _patLen - 1) ; + if (w == _pat) + ++rank ; + } + return rank ; + } + + size_t Select(size_t i, const WORD *B, size_t n) const + { + size_t j ; + size_t v = 0 ; + size_t count = 0 ; + while (2 * v + 2 < 2 * _r - 1) + { + if (_counts[2 * v + 1] + count >= i) + v = 2 * v + 1 ; + else + { + count += _counts[2 * v + 1] ; + v = 2 * v + 2 ; + } + } + + size_t k = NumLeaf(v) ; + for (j = k * _b ; j < (k + 1) * _b ; ++j) + { + WORD w = Utils::BitsRead(B, j, j + _patLen - 1) ; + if (w == _pat) + ++count ; + if (count == i) + return j ; + } + return 0 ; + } + + bool IsPattern(size_t i, const WORD *B, size_t n) const + { + if (i + _patLen - 1 >= n) + return false ; + return (Utils::BitsRead(B, i, i + _patLen - 1) == _pat) ; + + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _r) ; + SAVE_VAR(fp, _b) ; + SAVE_VAR(fp, _height) ; + SAVE_VAR(fp, _pat) ; + SAVE_VAR(fp, _patLen) ; + + if (_r > 0) + SAVE_ARR(fp, _counts, 2 * _r - 1) ; + } + + void Load(FILE *fp) + { + Free() ; + + LOAD_VAR(fp, _r) ; + LOAD_VAR(fp, _b) ; + LOAD_VAR(fp, _height) ; + LOAD_VAR(fp, _pat) ; + LOAD_VAR(fp, _patLen) ; + + if (_r > 0) + { + _counts = (size_t *)malloc(sizeof(size_t) * (2 * _r - 1)) ; + LOAD_ARR(fp, _counts, 2 * _r - 1) ; + _space = sizeof(*_counts) * (2*_r - 1) ; + } + } +} ; + +} +#endif diff --git a/compactds/DS_RangeMinMaxTree.hpp b/compactds/DS_RangeMinMaxTree.hpp new file mode 100644 index 0000000..92e1c03 --- /dev/null +++ b/compactds/DS_RangeMinMaxTree.hpp @@ -0,0 +1,920 @@ +#ifndef _MOURISL_COMPACTDS_DS_RANGEMINMMAXTREE +#define _MOURISL_COMPACTDS_DS_RANGEMINMMAXTREE + +// Based on section 7.1.1 +// Handles the excessive information in bit vector +#include "Utils.hpp" + +// Note that the excess can be negative, so we use int64_t in many places. +// The input B is like: +// 1 1 1 0 0 0 +// The excess tracking will be +// 0 1 2 3 2 1 0 +// (The search for index i is always with respect to between i-th B and (i+1)-th B.) 
+// The search for index i is always inclusive. +// The forward search will return the effect after j-th B. +// The backward search will return the effect before j-th B. +namespace compactds { +struct _rangeMinMaxTreeNode +{ + // Each number is within the block, so the value range should be small + // Assume block size is less than 2^15 + int16_t e ; // excess with respect to the beginning of the region + int16_t min ; // min e + int16_t max ; // max e + int16_t n ; // number of times hit min + + void Merge(const struct _rangeMinMaxTreeNode &b) + { + if (e + b.min < min) + { + min = e + b.min ; + n = b.n ; + } + else if (e + b.min == min) + n += b.n ; + + if (e + b.max > max) + max = e + b.max ; + e += b.e ; + } + + // Wrapper if we need the information from right to left (direction<0) + int16_t RevE() + { + return -e ; + } + + int16_t RevMin() + { + return min <= 0 ? (min - e) : -e ; + } + + int16_t RevMax() + { + return max >= 0 ? (max - e) : -e ; + } +} ; + +class DS_RangeMinMaxTree +{ +private: + size_t _space ; + size_t _r ; // number of regions + size_t _b ; // block size + size_t _n ; + size_t _height ; + struct _rangeMinMaxTreeNode *_tree ; + int _cwidth ; // chunk size + struct _rangeMinMaxTreeNode *_C ; // precomputed chunk + + // Maps leaf id k to the tree idx + size_t LeafNum(size_t k) const + { + const size_t width = 1ull<<_height ; + if (k < 2 * _r - width) + return width - 1 + k ; + else + return width - 1 - _r + k ; + } + + // Maps tree index to leaf index + size_t NumLeaf(size_t v) const + { + const size_t width = 1ull<<_height ; + if (v >= width - 1) + return v - width + 1 ; + else + return v - width + 1 + _r ; + } + + //@return: the 0-based level of tree index v located at + int GetLevel(size_t v) const + { + return Utils::CountBits(v + 1) - 1 ; + } + + // The max tree index that is at the same level as v + size_t LevelMaxNum(size_t v) const + { + size_t ret = (1ull<<(GetLevel(v) + 1)) - 2 ; + //if (ret >= 2*_r-1) + // ret = 2*_r - 2 ; + return 
ret ; + } + + // The min tree index that is at the same level as v + size_t LevelMinNum(size_t v) const + { + return LevelMaxNum(v) / 2 ; + } + + // Find the tree index containing v, l levels higher + size_t PromoteLevel(size_t v, int l) const + { + return ((v+1) >> l) - 1; + } + + // Update extreme value (min, max) + int64_t UpdateExtreme(int64_t e, int64_t x, int type) const + { + if ((type == 0 && x < e) + || (type == 1 && x > e)) + return x ; + return e ; + } + + void InitPrecomputedChunks() + { + size_t i ; + int j ; + // Precomputed block + _C = (struct _rangeMinMaxTreeNode *)malloc(sizeof(*_C) * (1<<_cwidth)) ; + _space += sizeof(*_C) * (1<<_cwidth) ; + + // Fill precomputed block + for (i = 0 ; i < (1ull<<_cwidth) ; ++i) + { + int16_t excess = 0 ; + int16_t min = 2, max = -2, minCnt = 0 ; + for (j = 0 ; j < _cwidth ; ++j) + { + excess += (2 * ((i >> j) & 1) - 1) ; + + if (excess < min || minCnt == 0) + { + min = excess ; + minCnt = 1 ; + } + else if (excess == min) + ++minCnt ; + + if (excess > max) + max = excess ; + } + _C[i].e = excess ; + _C[i].min = min ; + _C[i].max = max ; + _C[i].n = minCnt ; + } + } + + // Search excess difference after i inside the block containing i (block size is _b) + // @return: d, or the excess from i to the end of the block when no match. + // retj: the coordinate of the matched position, or just pass the block + int64_t FwdBlock(size_t i, int64_t d, size_t &retj, const WORD *B) const + { + size_t j ; + + size_t f = i / _cwidth ; // current chunk. f:from; t: to. 
+ size_t t = ((i / _b + 1) * _b ) / _cwidth - 1 ; // last chunk in the block + + int64_t excess = 0 ; + // search the current chunk + for (j = i ; j < f * _cwidth + _cwidth && j < _n ; ++j) + { + excess += (2 * Utils::BitRead(B, j) - 1) ; + + if (excess == d) + { + retj = j ; + return d ; + } + } + + // search the remaining chunks in the block + size_t p ; + for (p = f + 1 ; p <= t ; ++p) + { + WORD chunk = 0 ; + if (p * _cwidth >= _n) + break ; + + if ((p + 1) * _cwidth - 1 <= _n - 1) + chunk = Utils::BitsRead(B, p * _cwidth, (p+1) * _cwidth -1) ; + else // the chunk will be padding with 0's (\)), which may cause too small m + // but the border case will be handled after searching the hit chunk. + chunk = Utils::BitsRead(B, p * _cwidth, _n - 1) ; + + if ((d <= 0 && excess > d && excess + _C[chunk].min <= d) + || (d >= 0 && excess < d && excess + _C[chunk].max >= d)) + break ; + + excess += _C[chunk].e ; + } + + // Could not find it in current block + if (p > t) + { + retj = _cwidth * (t + 1) ; + return excess ; + } + if (p * _cwidth >= _n) + { + retj = _n ; + return excess ; + } + + // Search the hit chunk + for (j = p * _cwidth ; j < p * _cwidth + _cwidth && j < _n ; ++j) + { + excess += (2 * Utils::BitRead(B, j) - 1) ; + + if (excess == d) + { + retj = j ; + return d ; + } + } + + // Can only reach here in border case + retj = _n ; + return excess ; + } + + // Similarly to FwdBlack, but search backwards (to left) + int64_t BwdBlock(size_t i, int64_t d, size_t &retj, const WORD *B) const + { + size_t j ; + + size_t f = i / _cwidth ; // current chunk + size_t t = ((i / _b) * _b ) / _cwidth ; // first chunk in the block + + int64_t excess = 0 ; + // search the current chunk + for (j = i ; j >= f * _cwidth && j < _n ; --j) + { + excess -= (2 * Utils::BitRead(B, j) - 1) ; + + if (excess == d) + { + retj = j ; + return d ; + } + } + + //search the remaining chunks in the block + size_t p ; + for (p = f - 1 ; p >= t && p < _n ; --p) + { + WORD chunk = 0 ; + + chunk = 
Utils::BitsRead(B, p * _cwidth, (p+1) * _cwidth -1) ; + + if ((d <= 0 && excess > d && excess + _C[chunk].RevMin() <= d) + || (d >= 0 && excess < d && excess + _C[chunk].RevMax() >= d)) + break ; + + excess += _C[chunk].RevE() ; + } + + // Could not find it in current block + if (p < t) + { + retj = _cwidth * t - 1 ; + return excess ; + } + if (p >= _n) // not exist + { + retj = _n ; + return excess ; + } + + // Search the hit chunk + for (j = p * _cwidth + _cwidth - 1 ; j >= p * _cwidth && j < _n ; --j) + { + excess -= (2 * Utils::BitRead(B, j) - 1) ; + if (excess == d) + { + retj = j ; + return d ; + } + } + + // Can only reach here in border case + retj = _n ; + return excess ; + } + + // Scanning a block for min/max[i,j] + // Assumes i and j are in the same block + // As other block searches, it returns the excess in this block, + // the extreme value is passed through the reference + int64_t ExtremeBlock(size_t i, size_t j, int type, int64_t &extreme, const WORD *B) const + { + size_t p ; // the index for loop + + size_t f = i / _cwidth ; // first chunk + size_t t = j / _cwidth ; // last chunk + + int64_t excess = 0 ; + if (type == 0) + extreme = 2 ; + else + extreme = -2 ; + for (p = i ; p < f * _cwidth + _cwidth && p <= j && p < _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + extreme = UpdateExtreme(extreme, excess, type) ; + } + if (f == t) + return excess ; + + // Since we search to t-1, there is no need to worry about reading after _n + for (p = f + 1 ; p <= t - 1 ; ++p) + { + WORD chunk = Utils::BitsRead(B, p * _cwidth, (p+1) * _cwidth - 1) ; + extreme = UpdateExtreme(extreme, excess + + (type == 0 ? 
_C[chunk].min : _C[chunk].max), type) ; + excess += _C[chunk].e ; + } + + // last chunk + for (p = t * _cwidth ; p <= j && p <= _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + extreme = UpdateExtreme(extreme, excess, type) ; + } + + return excess ; + } + + // Given the global min value between [i,j], try to find its count + // @return: the excess after j or the block if j is out of the bo + int64_t MinCountBlock(size_t i, size_t j, int64_t min, size_t &minCnt, const WORD *B) const + { + size_t p ; // the index for loop + + size_t f = i / _cwidth ; // first chunk + size_t t = j / _cwidth ; // last chunk + + int64_t excess = 0 ; + minCnt = 0 ; + + for (p = i ; p < f * _cwidth + _cwidth && p <= j && p < _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + if (excess == min) + ++minCnt ; + } + if (f == t) + return excess ; + + // Since we search to t-1, there is no need to worry about reading after _n + for (p = f + 1 ; p <= t - 1 ; ++p) + { + WORD chunk = Utils::BitsRead(B, p * _cwidth, (p+1) * _cwidth - 1) ; + if (excess + _C[chunk].min == min) + minCnt += _C[chunk].n ; + excess += _C[chunk].e ; + } + + // last chunk + for (p = t * _cwidth ; p <= j && p <= _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + if (excess == min) + ++minCnt ; + } + + return excess ; + } + + // Return the excess. The coordinate of the k-th min is returned through selectk. 
set _n if not found + int64_t MinSelectBlock(size_t i, size_t j, int64_t min, size_t kthMin, size_t &selectk, size_t &minCnt, const WORD *B) const + { + size_t p ; // the index for loop + + size_t f = i / _cwidth ; // first chunk + size_t t = j / _cwidth ; // last chunk + + int64_t excess = 0 ; + minCnt = 0 ; + + for (p = i ; p < f * _cwidth + _cwidth && p <= j && p < _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + if (excess == min) + { + ++minCnt ; + if (minCnt == kthMin) + { + selectk = p ; + return excess ; + } + } + } + if (f == t) + { + selectk = _n ; + return excess ; + } + + // Since we search to t-1, there is no need to worry about reading after _n + for (p = f + 1 ; p <= t - 1 ; ++p) + { + WORD chunk = Utils::BitsRead(B, p * _cwidth, (p+1) * _cwidth - 1) ; + if (excess + _C[chunk].min == min) + { + if (kthMin <= minCnt + _C[chunk].n) + { + size_t chunki ; + for (chunki = p * _cwidth ; chunki < (p + 1) * _cwidth ; ++chunki) + { + excess += (2 * Utils::BitRead(B, chunki) - 1) ; + if (excess == min) + { + ++minCnt ; + if (minCnt == kthMin) + { + selectk = chunki ; + return excess ; + } + } + } + } + minCnt += _C[chunk].n ; + } + excess += _C[chunk].e ; + } + + // last chunk + for (p = t * _cwidth ; p <= j && p <= _n ; ++p) + { + excess += (2 * Utils::BitRead(B, p) - 1) ; + if (excess == min) + { + ++minCnt ; + if (minCnt == kthMin) + { + selectk = p ; + return excess ; + } + } + } + + return excess ; + } + +public: + DS_RangeMinMaxTree() + { + _space = _b = 0 ; + _cwidth = 8 ; + _tree = _C = NULL ; + } + + ~DS_RangeMinMaxTree() + { + Free() ; + } + + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + void Free() + { + if (_tree != 0) + { + free(_tree) ; + free(_C) ; + _tree = NULL ; + _C = NULL ; + } + } + + void SetBlockSize(size_t b) + { + _b = b ; + } + + // s: some special to track the count. 
+ // slen: length of the special character + void Init(const WORD *B, size_t n) + { + size_t i, j ; + if (_b <= 8) // block size has to be larger than a byte, and should be power of 2 + _b = 1024 ; + + _n = n ; + _r = DIV_CEIL(n, _b) ; + _height = Utils::Log2Ceil(_r) ; + _tree = (struct _rangeMinMaxTreeNode *)malloc(sizeof(*_tree) * (2 * _r - 1)) ; + _space += sizeof(*_tree) * (2*_r-1) ; + + InitPrecomputedChunks() ; + + // Initialize the leafs + for (i = 0 ; i < n ; i += _b) + { + size_t treeIdx = LeafNum(i / _b) ; + _tree[treeIdx].e = 0 ; + _tree[treeIdx].min = 2 ; + _tree[treeIdx].max = -2 ; + _tree[treeIdx].n = 0 ; + for (j = i ; j < n && j < i + _b ; j += _cwidth) + { + uint64_t chunk = Utils::BitsRead(B, j, j + _cwidth - 1) ; + _tree[treeIdx].Merge( _C[chunk] ) ; + } + } + + // Initialize internal nodes + for (i = _r - 2 ; i < _r ; --i) + { + _tree[i] = _tree[2 * i + 1] ; + if (2 * i + 2 < (2 * _r - 1)) + _tree[i].Merge(_tree[2 * i + 2]) ; + } + } + + //It's a bit different from the textbook that the FwdSearch + // in our implementation include the effect from i. + // This makes 0-based indexing have better definition. + //@return: the index j >= i that has excess different d + // return _n if not found + size_t FwdSearch(size_t i, int64_t d, const WORD *B, size_t n) const + { + size_t j ; + int64_t excess ; + excess = FwdBlock(i, d, j, B) ; + + if (excess == d) + return j ; + if (j == _n) + return _n ; + // Not in current block, so we need to search the tree to find the block + // v is the tree node index. + size_t v = LeafNum(i / _b) ; + + // Go up the tree first + // After the iterations, v+1 should be the node containing the target j. 
+ // The test for v+1>=2*_r-1 is for the case where the last level is not full (leaf level) + // if it is rightmost leaf on the last level, we can directly go to parent + while (v + 1 <= LevelMaxNum(v) && + (v + 1 >= 2*_r-1 || (d <= 0 && excess > d && excess + _tree[v + 1].min > d) + || (d >= 0 && excess < d && excess + _tree[v + 1].max < d) )) + // next node block is not enough + // The next node not necessarily the brother node, but the covered region is adjacent + // based on the numbering system. + { + if ((v & 1) == 1) // v is left child. Note that our index is 0-based, so it is not the 2*xxx relation in 1-based index. + excess += _tree[v + 1].e ; + v = (v-1) / 2 ; // parent node + } + + if (v == LevelMaxNum(v)) // Not found. v is the rightmost block on the level + return _n ; + + // Go down the tree to locate the block + ++v ; + while (2 * v + 2 < 2 * _r - 1) + { + if ((d <= 0 && excess > d && excess + _tree[2 * v + 1].min <= d) + || (d >= 0 && excess < d && excess + _tree[2 * v + 1].max >= d)) + v = 2 * v + 1 ; + else + { + excess += _tree[2 * v + 1].e ; + v = 2 * v + 2 ; + } + } + + // The else branch above may go beyond the number of blocks + if (v >= 2 * _r - 1) + return _n ; + + // Search the target leaf block + // The FwdBlock searches things after the index, so we need put -1 in + // NumLeaf(v) * _b + excess = FwdBlock(NumLeaf(v) * _b, d - excess, j, B) ; + return j ; + } + + //@return: the index j <= i that has excess different d comparing with i from right to left + // return _n if not found + size_t BwdSearch(size_t i, int64_t d, const WORD *B, size_t n) const + { + size_t j ; + int64_t excess ; + /*if (i == 0) + { + // d == 1, b0 == 0 + // or d == -1, b0==1 + if (d == -2 * Utils::BitRead(B, 0) + 1 ) + return 0 ; + return _n ; + }*/ + + excess = BwdBlock(i, d, j, B) ; + if (excess == d) + return j ; + if (j == _n) + return _n ; + size_t v = LeafNum(i / _b) ; + + // Go up the tree first + // After the iterations, v-1 should be the node containing the 
target j. + while (v != 0 && v - 1 >= LevelMinNum(v) && + ((d <= 0 && excess > d && excess + _tree[v - 1].RevMin() > d) + || (d >= 0 && excess < d && excess + _tree[v - 1].RevMax() < d) )) + // next node block is not enough + // The next node not necessarily the brother node, but the covered region is adjacent + // based on the numbering system. + { + if ((v & 1) == 0) // v is right child. Note that our index is 0-based, so it is not the 2*xxx relation in 1-based index. + excess += _tree[v - 1].RevE() ; + v = (v-1) / 2 ; // parent node + } + + if (v == LevelMinNum(v)) // Not found. v is the leftmost block on the level + return _n ; + + // Go down the tree to locate the block + --v ; + while (2 * v + 2 < 2 * _r - 1) + { + if ((d <= 0 && excess > d && excess + _tree[2 * v + 2].RevMin() <= d) + || (d >= 0 && excess < d && excess + _tree[2 * v + 2].RevMax() >= d)) + v = 2 * v + 2 ; + else + { + excess += _tree[2 * v + 2].RevE() ; + v = 2 * v + 1 ; + } + } + + // The else branch above may go beyond the number of blocks + if (v >= 2 * _r - 1) + return _n ; + + + // Search the target leaf block + // BwdBlock is inclusive + excess = BwdBlock(NumLeaf(v) * _b + _b - 1, d - excess, j, B) ; + return j ; + } + + // type: 0-min, 1-max + //@return: the min/max value in B[i,j]: included the effects from B[i] and B[j] + int64_t ExtremeExcess(size_t i, size_t j, int type, const WORD *B, size_t n) const + { + int64_t extreme = 0 ; + int64_t excess = 0 ; + + excess = ExtremeBlock(i, MIN(j, (i / _b) * _b + _b - 1), type, extreme, B); + if (j/_b <= i / _b) // in the same block + return extreme ; + + //printf("%d %d: %d %d\n", i, j, extreme, excess) ; + // Search the tree + size_t v = LeafNum(i / _b) ; + size_t l = LeafNum(j / _b) ; + int levelv = GetLevel(v) ; + int levell = GetLevel(l) ; + + // Upward search + // v+1 > l: l is in the upper level + // or l is still to the right of v+1 + while (v + 1 > l || v+1 != PromoteLevel(l, levell - levelv)) + { + if ( (v & 1) == 1 && v+1 < 2*_r - 
1) //left children + { + extreme = UpdateExtreme(extreme, excess + + (type == 0 ? _tree[v + 1].min : _tree[v + 1].max), type) ; + excess += _tree[v + 1].e ; + } + v = (v - 1) / 2 ; + --levelv ; + } + //printf("%d %d: %d %d %d\n", i, j, v, extreme, excess) ; + + // Downward search. Now l should be in v+1 + ++v ; + while (v < _r - 1) // internal nodes + { + if ((type == 0 && extreme <= excess + _tree[v].min) + || (type == 1 && extreme >= excess + _tree[v].max)) + return extreme ; + + if (2 * v + 1 != PromoteLevel(l, levell - (levelv + 1))) + { + extreme = UpdateExtreme(extreme, excess + + (type == 0 ? _tree[2*v + 1].min : _tree[2*v + 1].max), type) ; + excess += _tree[2*v + 1].e ; + v = 2 * v + 2 ; + } + else + v = 2 * v + 1 ; + ++levelv ; + } + //printf("%d %d: %d %d. %d %d\n", i, j, v, extreme, excess, _tree[v].min) ; + + if ((type == 0 && extreme <= excess + _tree[v].min) + || (type == 1 && extreme >= excess + _tree[v].max)) + return extreme ; + + // last block + int64_t lastExtreme = 0 ; + ExtremeBlock((j / _b) * _b, j, type, lastExtreme, B) ; + //printf("%d %d: %d vs %d %d\n", i, j, extreme, excess, lastExtreme) ; + + return UpdateExtreme(extreme, excess + lastExtreme, type) ; + } + + // Leftmost position of a minimum in excess(B, i, j) + size_t Rmq(size_t i, size_t j, const WORD *B, size_t n) const + { + int64_t min = ExtremeExcess(i, j, 0, B, n) ; + return FwdSearch(i, min, B, n) ; + } + + // Leftmost position of a maximum in excess(B, i, j) + size_t RMq(size_t i, size_t j, const WORD *B, size_t n) const + { + int64_t max = ExtremeExcess(i, j, 1, B, n) ; + return FwdSearch(i, max, B, n) ; + } + + // Need .maxn in node structure to support maxcount but not implemented now, + // depends on future application to decide whether implement this feature. 
+ size_t MinCount(size_t i, size_t j, const WORD *B, size_t n) const + { + // The min in this whole range + int64_t min = ExtremeExcess(i, j, 0, B, n) ; + + // Get first block + size_t minCnt = 0 ; + int64_t excess = 0 ; + + excess = MinCountBlock(i, MIN(j, (i / _b) * _b + _b - 1), min, minCnt, B); + if (j/_b <= i / _b) // in the same block + return minCnt ; + //printf("%d: %d %d\n", i, min, minCnt) ; + // Search the tree + size_t v = LeafNum(i / _b) ; + size_t l = LeafNum(j / _b) ; + int levelv = GetLevel(v) ; + int levell = GetLevel(l) ; + + // Upward search + // v+1 > l: l is in the upper level + // or l is still to the right of v+1 + while (v + 1 > l || v+1 != PromoteLevel(l, levell - levelv)) + { + if ( (v & 1) == 1 && v+1 < 2*_r - 1) //left children + { + if (excess + _tree[v + 1].min == min) + minCnt += _tree[v + 1].n ; + excess += _tree[v + 1].e ; + } + v = (v - 1) / 2 ; + --levelv ; + } + //printf("%d: %d %d\n", i, v, excess) ; + // Downward search. Now l should be in v+1 + ++v ; + while (v < _r - 1) // internal nodes + { + if (min < excess + _tree[v].min) + return minCnt ; + + if (2 * v + 1 != PromoteLevel(l, levell - (levelv + 1))) + { + if (excess + _tree[2*v + 1].min == min) + minCnt += _tree[2*v + 1].n ; + excess += _tree[2*v + 1].e ; + v = 2 * v + 2 ; + } + else + v = 2 * v + 1 ; + ++levelv ; + } + + //printf("%d: %d %d. %d %d %d\n", i, min, minCnt, excess, v, _tree[v].min) ; + if (min < excess + _tree[v].min) + return minCnt ; + + // last block + size_t lastMinCnt = 0 ; + // Notice the we need to use min-excess here to adjust the excess so far. 
+ MinCountBlock((j / _b) * _b, j, min - excess, lastMinCnt, B) ; + + //printf("%d: %d %d %d\n", i, min, minCnt, lastMinCnt) ; + return minCnt + lastMinCnt ; + } + + // Select the t-th (1-based) minimum element in B[i..j] + size_t MinSelect(size_t i, size_t j, size_t t, const WORD *B, size_t n) const + { + // The min in this whole range + int64_t min = ExtremeExcess(i, j, 0, B, n) ; + + // Get first block + size_t minCnt = 0 ; + int64_t excess = 0 ; + size_t ret = _n ; + + excess = MinSelectBlock(i, MIN(j, (i / _b) * _b + _b - 1), min, t, ret, minCnt, B); + //printf("%d: %d %d\n", i, ret, minCnt) ; + if (j/_b <= i / _b // in the same block + || ret < _n ) // already found + return ret ; + + // Search the tree + size_t v = LeafNum(i / _b) ; + size_t l = LeafNum(j / _b) ; + int levelv = GetLevel(v) ; + int levell = GetLevel(l) ; + + //printf("%d: %d %d. %d %d\n", i, min, minCnt, v, _r) ; + // Upward search + // v+1 > l: l is in the upper level + // or l is still to the right of v+1 + while (v + 1 > l || v+1 != PromoteLevel(l, levell - levelv)) + { + if (v + 1 < 2 * _r - 1 && excess + _tree[v + 1].min == min + && minCnt + _tree[v + 1].n >= t) + break ; + + if ( (v & 1) == 1 && v+1 < 2*_r - 1) //left children + { + if (excess + _tree[v + 1].min == min) + minCnt += _tree[v + 1].n ; + excess += _tree[v + 1].e ; + } + v = (v - 1) / 2 ; + --levelv ; + } + + if (v == LevelMaxNum(v)) // Not found. v is the rightmost block on the level + return _n ; + + //printf("%d: %d %d %d\n", i, v, excess, minCnt) ; + // Downward search. + ++v ; + while (v < _r - 1) // internal nodes + { + if (min < excess + _tree[v].min) + return ret ; + + if ( 2 * v + 1 != PromoteLevel(l, levell - (levelv + 1)) //j is in the right chilad + && (excess + _tree[2 * v + 1].min != min + || minCnt + _tree[2 * v+1].n < t)) // left child could not reach t. 
+ { + if (excess + _tree[2*v + 1].min == min) + minCnt += _tree[2*v + 1].n ; + excess += _tree[2*v + 1].e ; + v = 2 * v + 2 ; + } + else + v = 2 * v + 1 ; + + ++levelv ; + } + + //printf("%d: %d %d. %d %d %d\n", i, min, minCnt, excess, v, _tree[v].min) ; + if (min < excess + _tree[v].min) + return _n ; + + // last block + size_t lastMinCnt = 0 ; + // Notice the we need to use min-excess here to adjust the excess so far. + v = NumLeaf(v) ; + MinSelectBlock(v * _b, MIN(j, (v+1) * _b - 1), min - excess, t - minCnt, ret, lastMinCnt, B) ; + + //printf("%d: %d %d %d %d %d\n", i, v, min, minCnt, lastMinCnt, ret) ; + return ret ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _r) ; + SAVE_VAR(fp, _b) ; + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _height) ; + SAVE_VAR(fp, _cwidth) ; + SAVE_ARR(fp, _tree, 2 * _r - 1) ; + SAVE_ARR(fp, _C, 1 << _cwidth) ; + } + + void Load(FILE *fp) + { + Free() ; + LOAD_VAR(fp, _r) ; + LOAD_VAR(fp, _b) ; + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _height) ; + LOAD_VAR(fp, _cwidth) ; + _tree = (struct _rangeMinMaxTreeNode *)malloc(sizeof(*_tree) * (2 * _r - 1)) ; + LOAD_ARR(fp, _tree, 2 * _r - 1) ; + _C = (struct _rangeMinMaxTreeNode *)malloc(sizeof(*_C) * (1<<_cwidth)) ; + LOAD_ARR(fp, _C, 1 << _cwidth) ; + _space = sizeof(struct _rangeMinMaxTreeNode) * (2 * _r - 1 + (1<<_cwidth)) ; + } +} ; +} + +#endif diff --git a/compactds/DS_Rank.hpp b/compactds/DS_Rank.hpp new file mode 100644 index 0000000..dbe17ff --- /dev/null +++ b/compactds/DS_Rank.hpp @@ -0,0 +1,298 @@ +#ifndef _MOURISL_COMPACTDS_DS_RANK +#define _MOURISL_COMPACTDS_DS_RANK + +#include "Utils.hpp" +#include "FixedSizeElemArray.hpp" + +// The standalone data structe for rank query on a plain bitvector +// Time complexity: constant time +// Extra space complexity (bits): n/b + n/w*log(bw) +namespace compactds { +class DS_Rank +{ +private: + uint64_t *_R ; // the partial sum of 1s for blocks of the bit vector (right exclusive) + FixedSizeElemArray _subR ; // the partial sum within each block for 
constant access + int _b ; // block size, with respective to the word. + int _bshift ; // the number of bits-1 of b + size_t _wordCnt ; + size_t _space ; +public: + DS_Rank() + { + _R = NULL ; + _b = _space = 0 ; + } + + DS_Rank(int blockSize, const WORD *B, const int &n) + { + Init(blockSize, B, n) ; + } + + ~DS_Rank() { Free() ; } + + void Free() + { + if (_R != NULL) + { + free(_R) ; + _R = NULL ; + } + _b = 0 ; + } + + size_t GetSpace() { return _space + sizeof(*this); } + + // blockSize is the number of WORDs for each R + void Init(int blockSize, const WORD *B, const size_t &n) + { + size_t i ; + _b = blockSize ; + if (_b <= 0) + _b = WORDBITS ; + + i = _b >> 1 ; + for (_bshift = 0 ; i != 0 ; i >>= 1, ++_bshift) + ; + + _wordCnt = Utils::BitsToWords(n) ; + size_t blockCnt = DIV_CEIL(_wordCnt, _b) ; + _R = (uint64_t *)malloc(sizeof(uint64_t) * blockCnt) ; + _space += sizeof(uint64_t) * blockCnt ; + _subR.Malloc(Utils::Log2Ceil((_b-1)*WORDBITS), _wordCnt - blockCnt) ; // we don't need to store the first sub-block in each block + _space += _subR.GetSpace() - sizeof(_subR) ; + uint64_t onecntSum = 0 ; + size_t localOneCntSum = 0 ; + for (i = 0 ; i < _wordCnt ; ++i) + { + if (i % _b == 0) + { + _R[i/_b] = onecntSum ; + localOneCntSum = 0 ; + } + else + { + _subR.Write(i - i / _b - 1, localOneCntSum) ; + } + int onecnt = Utils::Popcount(B[i]) ; + onecntSum += onecnt ; + localOneCntSum += onecnt ; + } + } + + int GetBlockSize() const // unit in word + { + return _b ; + } + + int GetSubBlockSize() const + { + return WORDBITS ; + } + + uint64_t *GetR() const + { + return _R ; + } + + const FixedSizeElemArray *GetSubR() const + { + return &_subR ; + } + + + size_t Query(size_t i, const WORD *B, const size_t &n, int inclusive = 1) const + { + if (i >= n) + return Query(n - 1, B, n, inclusive) ; + + size_t wi = i >> WORDBITS_WIDTH ; + return _R[wi >> _bshift] + ((wi&(_b - 1)) ? 
_subR.Read(wi - (wi >> _bshift) - 1) : 0) + + Utils::Popcount(B[wi] & ((MASK(i&(WORDBITS - 1))<> 3) * 2 ; // region/block id + const size_t t = (wi & 7) - 1 ; // the offset in the subblock + return _R[ri] + ((_R[ri + 1]>> ((t + ((t>>60)&8))*9)) & 0x1ff) ; + } + + uint64_t *GetR() const + { + return _R ; + } + + // blockSize is the number of WORDs for each R + void Init(const WORD *B, const size_t &n) + { + size_t i ; + const int b = 8 ; // number of word in each block + const int subrWidth = Utils::Log2Ceil((b-1) * 64) ; // should equal to 9 + _wordCnt = Utils::BitsToWords(n) ; + size_t blockCnt = DIV_CEIL(_wordCnt, b) ; + _R = (uint64_t *)calloc(blockCnt * 2, sizeof(uint64_t)) ; + _space = sizeof(uint64_t) * blockCnt * 2 ; + uint64_t onecntSum = 0 ; + size_t localOneCntSum = 0 ; + for (i = 0 ; i < _wordCnt ; ++i) + { + size_t bi = i/b * 2 ; // block index + int br = i % b ; //remainder + if (br == 0) + { + _R[bi] = onecntSum ; + _R[bi + 1] = 0 ; + localOneCntSum = 0 ; + } + else + { + _R[bi + 1] |= (localOneCntSum << ((br - 1)*subrWidth)) ; + } + int onecnt = Utils::Popcount(B[i]) ; + onecntSum += onecnt ; + localOneCntSum += onecnt ; + } + // Fill in the remaining subr blocks + // so other module don't need to worry + // too much about boundary case + if ((i-1) % b > 0) + { + size_t bi = i/b * 2 ; // block index + for ( ; i % b ; ++i) + { + int br = i % b ; + _R[bi + 1] |= (localOneCntSum << ((br - 1)*subrWidth)) ; + } + } + } + + // read the si-th subblock in block bi + int DecodeSubR(size_t bi, size_t si) const + { + return (_R[2 * bi + 1] >> (si * 9)) & 0x1ff; + } + + size_t Query(size_t i, const WORD *B, const size_t &n, int inclusive = 1) const + { + if (i >= n) + return Query(n - 1, B, n, inclusive) ; + + const size_t wi = (i>>WORDBITS_WIDTH) ; // word id + const size_t ri = (wi >> 3) * 2 ; // region/block id + const size_t t = (wi & 7) - 1 ; // the offset in the subblock + // 0x1ff is the mask for 9 bit + // The ((t>>60)&8))*9) portion is to avoid 
branching when wi%8 == 0 + // In this case, t=0xffff.., and (t + ((t>>60)&8))*9) == 63 + // and the top bit of _R[ri+1] is 0, which makes the whole portion == 0 + // The implementation of inclusive also avoids branching using property that + // inclusive variable is binary. + return _R[ri] + ((_R[ri + 1]>> ((t + ((t>>60)&8))*9)) & 0x1ff) + + Utils::Popcount(B[wi] & ((MASK(i&(WORDBITS - 1))< _n) + return POSITIVE_INF ; + if (_n == 1) // i must == 1 here + return 1 ; + + size_t si = (i - 1) / _b ; + + size_t l, m, r ; // variables for binary search. They are coordinates on B + l = _S[type][si] ; + r = _S[type][si + 1] ; + if ((i - 1) % _b == 0) + return l ; + if (_speed == 1 || r - l < _longBlockLength) // r-l is more efficient than V.access. + { + if (_speed == 3) + { + // Adjust l, r using _miniblocks + size_t oldl = l ; + l = oldl + _Imini[type].Read((i - 1) / _minib) ; + if ((i - 1) % _minib + 1 < (unsigned int)(_b / _minib)) // only adjust r if it is not in the last _miniblock in current block + r = oldl + _Imini[type].Read((i - 1) / _minib + 1) ; + } + if (_speed <= 3) + { + --r ; + + // Locate the R + uint64_t *rankR = rank.GetR() ; // rankR is right open + int rankBlockSize = rank.GetBlockSize() ; // this block size is with respect to WORD + size_t rl, rr ; + size_t tmp ; + rl = l / (rankBlockSize * WORDBITS) ; + rr = r / (rankBlockSize * WORDBITS) ; + while (rl <= rr) + { + m = (rl + rr) / 2 ; + tmp = rankR[m] ; + if (type == 0) + tmp = m * rankBlockSize * WORDBITS - tmp ; + + if (tmp < i) + rl = m + 1 ; + else + rr = m - 1 ; // rankR[0]==0 makes sure m>=1 in the process + } + + // Locate the subR + size_t remaining ; + if (type == 1) + remaining = i - rankR[rr] ; + else + remaining = i - (rr * (rankBlockSize * WORDBITS) - rankR[rr]) ; + size_t subrl, subrr, fixedSubrl ; + int rankSubBlockSize = rank.GetSubBlockSize() ; // This block size is with respecto to bits + const FixedSizeElemArray &rankSubR = *(rank.GetSubR()) ; + + subrl = rr * (rankBlockSize * 
WORDBITS / rankSubBlockSize - 1) ; // the first subr has offset 0, so we don't store them. + subrr = subrl + rankBlockSize * WORDBITS / rankSubBlockSize - 2 ; + if (subrr >= rankSubR.GetSize()) + subrr = rankSubR.GetSize() - 1 ; + bool inFirstSubBlock = false ; + if (rankSubR.GetSize() == 0 || (type == 1 && (uint32_t)rankSubR.Read(subrl) >= remaining) + || (type == 0 && rankSubBlockSize - (uint32_t)rankSubR.Read(subrl) >= remaining) + || subrl >= rankSubR.GetSize()) // The case that the last block has only one subblock, which will not be allocated + inFirstSubBlock = true ; + + fixedSubrl = subrl ; + if ( !inFirstSubBlock ) + { + while (subrl <= subrr) + { + m = (subrl + subrr) / 2 ; + tmp = rankSubR.Read(m) ; + if (type == 0) + tmp = (m - fixedSubrl + 1) * rankSubBlockSize - tmp ; // plus 1 here to incorporate the first sub block + if (tmp < remaining) + subrl = m + 1 ; + else + subrr = m - 1 ; // the in firstsubblock test makes sure this part won't under-flow + } + + if (type == 1) + remaining -= rankSubR.Read(subrr) ; + else + remaining -= ((subrr - fixedSubrl + 1) * rankSubBlockSize - rankSubR.Read(subrr)) ; + } + + // Processing the last WORD + size_t lastWi = 0 ; // index of the last word + WORD lastW = 0 ; + if (inFirstSubBlock) + lastWi = rr * rankBlockSize ; + else + lastWi = rr + (subrr + 1) * (rankSubBlockSize / WORDBITS) ; // here the rr is to compensate for the first subblock missed in every sub block + lastW = B[lastWi] ; + size_t j ; + + int sum = 0 ; + for (j = 0 ; j < WORDBITS ; j += _precomputeb) + { + WORD x = (lastW >> j) & MASK(_precomputeb) ; + int tmp = Utils::Popcount(x) ; + if (type == 0) + tmp = _precomputeb - tmp ; + if (sum + tmp >= (int)remaining) + { + return lastWi * WORDBITS + j + _precomputedShortMiniBlock[type].Read(x * _precomputebElem + remaining - sum - 1) ; + } + sum += tmp ; + } + return POSITIVE_INF ; // should not reach here. 
+ } + else // _speed >= 4 + { + size_t skippedMiniBlocksInLong = _rankV[type].Query(si, _V[type], n, 0) * (_b/_minib) ; + size_t iMini = (i - 1) / _minib - skippedMiniBlocksInLong ; + if ( Utils::BitRead(_Vmini[type], iMini) ) + { + // long mini block + if ((i-1) % _minib == 0) + { + return l + _Imini[type].Read(iMini) ; + } + else + { + size_t iLongMini = _rankVmini[type].Query(iMini - skippedMiniBlocksInLong, + _Vmini[type], _VminiSize[type], 0) ; + //printf("%d\n", iLongMini * (_minib - 1) + (i - 1)%_minib - 1) ; + /*printf("b=%d _minib=%d. i=%d iMini=%d skippedMini=%d iLongMini=%d l=%d _Imini[iMini]=%d x=%d _Ilongmini[x]=%d. ret=%d\n", + b, _minib, i, iMini, skippedMiniBlocksInLong, + iLongMini, l, _Imini[type].Read(iMini), + iLongMini * (_minib - 1) + (i-1)%_minib - 1, _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1)%_minib - 1), + l + _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1) % _minib - 1)) ;*/ + return l + _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1) % _minib - 1) ; + } + } + else + { + // short mini block + size_t offset = l + _Imini[type].Read(iMini) ; + WORD localw = Utils::BitsRead(B, offset, offset + _longMiniBlockLength - 2) ; + return offset + _precomputedShortMiniBlock[type].Read(localw * _minib + (i-1)%_minib) ; + } + } + } + else + { + // long block with sparse 1's + size_t iI = (_rankV[type].Query(si, _V[type], n) - 1) * (_b - 1); // block index in I + //printf("long block %d %d %d %d %d. %d\n", i, b, si, Utils::BitRead(V[type], si), iI, _I[type].Read(iI + (i - 1)%b - 1)) ; + return _I[type].Read(iI + (i - 1)%_b - 1) ; + } + } + + // The select that handles both types (0 or 1). Optimized for rank9 + size_t GeneralQuery(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n, int type) const + { + if (i < 1 || i > _n) + return POSITIVE_INF ; + if (_n == 1) // i must == 1 here + return 1 ; + + size_t si = (i - 1) / _b ; + + size_t l, m, r ; // variables for binary search. 
They are coordinates on B + l = _S[type][si] ; + r = _S[type][si + 1] ; + if ((i - 1) % _b == 0) + return l ; + if (_speed == 1 || r - l < _longBlockLength) // r-l is more efficient than V.access. + { + if (_speed == 3) + { + // Adjust l, r using _miniblocks + size_t oldl = l ; + const int factor = _b / _minib ; // there are factor _miniblocks in each block + l = oldl + _Imini[type].Read((i - 1) / _minib) ; + if ( (int)((i - 1)/_minib) % factor + 1 < factor // only adjust r if it is not in the last _miniblock in current block + && r < _n // and the _miniblock is not the last _miniblock in the all the array + ) + r = oldl + _Imini[type].Read((i - 1) / _minib + 1) ; + } + if (_speed <= 3) + { + --r ; + + // Locate the R + uint64_t *rankR = rank.GetR() ; + const int rankBlockSize = rank.GetBlockSize() ; // this block size is with respect to WORD + size_t rl, rr ; + size_t tmp ; + rl = l / (rankBlockSize * WORDBITS) ; + rr = r / (rankBlockSize * WORDBITS) ; + while (rl <= rr) + { + m = (rl + rr) / 2 ; + tmp = rankR[m << 1] ; + if (type == 0) + tmp = m * rankBlockSize * WORDBITS - tmp ; + + if (tmp < i) + rl = m + 1 ; + else + rr = m - 1 ; // rankR[0]==0 makes sure m>=1 in the process + } + + // Locate the subR + size_t remaining ; + if (type == 1) + remaining = i - rankR[rr<<1] ; + else + remaining = i - (rr * (rankBlockSize * WORDBITS) - rankR[rr<<1]) ; + if (remaining == 512) // Happens only when the block is all 1 and we are query the last element. 
Number 512 requires more than 9 bits to represent + { + return rr * rankBlockSize * WORDBITS + 511 ; + } + // Mark the lowest bit for every 9-bit block + const uint64_t l9 = 0x40201008040201ull ; + const uint64_t h9 = l9 << 8 ; // mark the highest bit for every 9-bit block + const uint64_t expandRem = remaining * l9 ; + uint64_t subrWord = rankR[rr * 2 + 1] ; + if (type == 0) // need to take corresponding complement to get the accumulate counts for 0 + { + //64 + ((64*2)<<(9*1)) + ((64*3)<<(9*2)) + ((64*4)<<(9*3)) + ((64*5)<<(9*4)) + ((64*6)<<(9*5)) + ((64*7)<<(9*6)) = 0x7030140803000040ull + subrWord = 0x7030140803010040ull - subrWord ; + } + uint64_t bitblockComp = BITBLOCK_LT(subrWord, expandRem, h9) ; + size_t subrr = (((bitblockComp >> 8) * l9) >> 54ull) & 7ull ; + // Processing the last WORD + size_t lastWi ; // index of the last word + lastWi = rr * rankBlockSize + subrr ; + if (lastWi >= rank.GetWordCnt()) + { + lastWi = rank.GetWordCnt() - 1 ; + subrr = lastWi - rr * rankBlockSize ; + } + WORD lastW = B[lastWi] ; + if (subrr > 0) + { + remaining -= ((subrWord >> ((subrr-1) * 9)) & 0x1ff) ; + } + + if (type == 0) + lastW = ~lastW ; + + return lastWi * WORDBITS + Utils::SelectInWord(lastW, remaining) ; + //return POSITIVE_INF ; // should not reach here. + } + else // _speed >= 4 + { + size_t skippedMiniBlocksInLong = _rankV[type].Query(si, _V[type], n, 0) * (_b/_minib) ; + size_t iMini = (i - 1) / _minib - skippedMiniBlocksInLong ; + if ( Utils::BitRead(_Vmini[type], iMini) ) + { + // long mini block + if ((i-1) % _minib == 0) + { + return l + _Imini[type].Read(iMini) ; + } + else + { + size_t iLongMini = _rankVmini[type].Query(iMini - skippedMiniBlocksInLong, + _Vmini[type], _VminiSize[type], 0) ; + //printf("%d\n", iLongMini * (_minib - 1) + (i - 1)%_minib - 1) ; + /*printf("b=%d _minib=%d. i=%d iMini=%d skippedMini=%d iLongMini=%d l=%d _Imini[iMini]=%d x=%d _Ilongmini[x]=%d. 
ret=%d\n", + b, _minib, i, iMini, skippedMiniBlocksInLong, + iLongMini, l, _Imini[type].Read(iMini), + iLongMini * (_minib - 1) + (i-1)%_minib - 1, _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1)%_minib - 1), + l + _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1) % _minib - 1)) ;*/ + return l + _Ilongmini[type].Read(iLongMini * (_minib - 1) + (i-1) % _minib - 1) ; + } + } + else + { + // short mini block + size_t offset = l + _Imini[type].Read(iMini) ; + WORD localw = Utils::BitsRead(B, offset, offset + _longMiniBlockLength - 2) ; + return offset + _precomputedShortMiniBlock[type].Read(localw * _minib + (i-1)%_minib) ; + } + } + } + else + { + // long block with sparse 1's + size_t iI = (_rankV[type].Query(si, _V[type], n) - 1) * (_b - 1); // block index in I + //printf("long block %d %d %d %d %d. %d\n", i, b, si, Utils::BitRead(V[type], si), iI, _I[type].Read(iI + (i - 1)%b - 1)) ; + return _I[type].Read(iI + (i - 1)%_b - 1) ; + } + } +public: + DS_Select() + { + _S[0] = _S[1] = NULL ; + _V[0] = _V[1] = NULL ; + _Vmini[0] = _Vmini[1] = NULL ; + _n = _totalOneCnt = _b = _space = 0 ; + } + + DS_Select(int blockSize, const WORD *B, const int &n, int selectSpeed, int selectTypeSupport) + { + Init(blockSize, B, n, selectSpeed, selectTypeSupport) ; + } + + ~DS_Select() { Free() ; } + + void Free() + { + int i ; + for (i = 0 ; i <= 1 ; ++i) + { + if (_S[i] != NULL) + { + free(_S[i]) ; + _S[i] = NULL ; + } + + if (_V[i] != NULL) + { + free(_V[i]) ; + _V[i] = NULL ; + } + _rankV[i].Free() ; + _I[i].Free() ; + + if (_Vmini[i] != NULL) + { + free(_Vmini[i]) ; + _Vmini[i] = NULL ; + } + _rankVmini[i].Free() ; + _Imini[i].Free() ; + _Ilongmini[i].Free() ; + _precomputedShortMiniBlock[i].Free() ; + } + _n = _b = 0 ; + } + + size_t GetSpace() { return _space + sizeof(*this); } + + // blockSize is the number of WORDs for each R + // selectTypeSupport: bit coding for whether allocate memory to support select0 and select1 + // 0-bit: select 0, 1-bit: selct1; so 3 means 
support both + void Init(int blockSize, const WORD *B, const size_t &n, int selectSpeed, int selectTypeSupport) + { + _speed = selectSpeed ; + this->_n = n ; + if (selectSpeed == 0 || selectTypeSupport == 0 || n <= 1) + return ; + size_t i, j ; + size_t wordCnt = Utils::BitsToWordBytes(n) / sizeof(WORD) ; + size_t *posBuffer = NULL; + _space = 0 ; + _b = blockSize ; + + // Set the parameters based the desired _speed + if (_b <= (int)WORDBITS) + { + _b = WORDBITS * WORDBITS; + if (_speed == 2) + _b = WORDBITS * Utils::Log2Ceil(n) ; //* Utils::Log2Ceil( Utils::Log2Ceil(n) ) ; + if (_speed == 3) + _b = WORDBITS * WORDBITS ; + if (_speed == 4) + _b = WORDBITS * Utils::Log2Ceil(n) ; + } + + int logn = Utils::Log2Ceil(n) ; + //_longBlockLength = _b * Utils::Log2Ceil(n) * Utils::Log2Ceil(n) ; // Two sampled 1's are too far apart. It should be b*log^2 n + _longBlockLength = logn * logn * logn * logn ; // Two sampled 1's are too far apart. It should be log^4 n + if (_longBlockLength < (unsigned int)_b) + _longBlockLength = _b ; + + _longMiniBlockLength = 0 ; + if (_speed == 2 || _speed == 3) + { + if (n >= (1<<30)) + _precomputeb = 16 ; // relate to precomputed select + else + _precomputeb = 8 ; + _precomputebElem = _precomputeb ; + if (_speed == 3) + { + _minib = 2 * WORDBITS ;//logn * logn ;//CEIL(sqrt((double)b)) ; + _minib -= _b % _minib ; + if (_minib < 3) + { + _minib = 3 ; + if (_b % 3) + _minib = 3 + _b%3 ; + } + } + } + else if (_speed == 4) + { + //_minib = sqrt(log n) + _minib = CEIL(pow((double)_b, 0.25)) ; // We make _minib depends on the choice of _b so it is easier to control the block size. 
+ _minib -= _b % _minib ; + if (_minib < 3) + { + _minib = 3 ; + if (_b % 3) + _minib = 3 + _b%3 ; + } + _longMiniBlockLength = DIV_CEIL(_minib * _minib, 2) ; + posBuffer = (size_t*)malloc(sizeof(*posBuffer) * (_b+1)) ; + _precomputeb = _longMiniBlockLength - 1 ; + _precomputebElem = _minib ; + } + + _totalOneCnt = 0 ; + for (i = 0 ; i < wordCnt ; ++i) + _totalOneCnt += Utils::Popcount(B[i]) ; + + // Sample every other _b 1's (or 0's) + size_t blockCnt[2] ; + blockCnt[0] = DIV_CEIL((n - _totalOneCnt), _b) + 1 ; + blockCnt[1] = DIV_CEIL(_totalOneCnt, _b) + 1 ; + for (i = 0 ; i <= 1 ; ++i) + { + if (!(selectTypeSupport & (1<>j) & 1ull ; + if (!(selectTypeSupport & (1<= 2) + { + int k ; + for (k = 0 ; k <= 1 ; ++k) + { + if (!(selectTypeSupport & (1<= 3) + _Imini[k].Malloc(Utils::Log2Ceil(_longBlockLength), blockCnt[k] * (_b/_minib)) ; + + if (_speed >= 4) + { + _Vmini[k] = Utils::MallocByBits(blockCnt[k] * (_b / _minib)) ; + // The long mini block can be almost as larage as long block length - 1 + _Ilongmini[k].Malloc(Utils::Log2Ceil(_longBlockLength), DIV_CEIL(n, _longMiniBlockLength) * _minib) ; + } + + size_t newISize = 0 ; + size_t new_IminiSize = 0 ; + size_t new_IlongminiSize = 0 ; + for (i = 0 ; i < blockCnt[k] - 1 ; ++i) + { + if (_S[k][i + 1] - _S[k][i] >= _longBlockLength) + { + Utils::BitSet(_V[k], i) ; + // The first element is already stored in S, so no need to store it + for (j = _S[k][i] + 1 ; j < _S[k][i + 1] ; ++j) + { + if (Utils::BitRead(B, j) == k) + { + _I[k].Write(newISize, j) ; + ++newISize ; + } + } + + if (_speed == 3) // For _speed 3, we still fill up I mini + // so we don't need to acces _rankV for efficiency. + // Maybe I should do this to _speed 4 as well. 
+ { + for (j = 0 ; j < (size_t)(_b / _minib) ; ++j) + { + _Imini[k].Write(new_IminiSize, 0) ; + ++new_IminiSize ; + } + } + } + else if (_speed >= 3) // short block case, we only need to process them when _speed==3 + { + int minicnt = 1; + size_t prevj = _S[k][i] ; + if (_speed >= 4) + posBuffer[0] = _S[k][i] ; + // j reaches the beginning of the next block so we can wrap up any unadded + // k's to the _miniblock. This handles both case that the last _miniblock in + // a block or the last _miniblock in the whole bit vector. + for (j = _S[k][i] + 1; j <= _S[k][i + 1] ; ++j) + { + int bit = 0 ; + if (j < n) + bit = Utils::BitRead(B, j) ; + if (bit == k || (j == _S[k][i + 1] && minicnt > 0)) + { + if (minicnt == _minib || (j == _S[k][i + 1] && minicnt > 0)) + { + _Imini[k].Write(new_IminiSize, prevj - _S[k][i]) ; + ++new_IminiSize ; + + if (_speed >= 4 && j - prevj >= _longMiniBlockLength) + { + int l ; + Utils::BitSet(_Vmini[k], new_IminiSize - 1) ; + for (l = 1 ; l < minicnt ; ++l) // we don't need to store the first element + { + _Ilongmini[k].Write(new_IlongminiSize, posBuffer[l] - _S[k][i]) ; + ++new_IlongminiSize ; + } + } + + prevj = j ; + minicnt = 0 ; + } + + if (bit == k) + { + if (_speed >= 4) + posBuffer[minicnt] = j ; + ++minicnt ; + } + } + } + } + } + _I[k].Resize(newISize) ; + _space += _I[k].GetSpace() - sizeof(_I[k]) ; + + _rankV[k].Init(_V[k], blockCnt[k]) ; + _space += _rankV[k].GetSpace() - sizeof(_rankV[k]) ; + + if (_speed >= 3) + { + _Imini[k].Resize(new_IminiSize) ; + _space += _Imini[k].GetSpace() - sizeof(_Imini[k]) ; + //printf("%d %d. %d. 
%d %d\n", _Imini[k].GetSpace(), new_IminiSize, Utils::Log2Ceil(_longBlockLength), _minib, n/_minib) ; + } + + if (_speed >= 4) + { + _Vmini[k] = (WORD *)realloc(_Vmini[k], + Utils::BitsToWordBytes(new_IminiSize)) ; + _VminiSize[k] = new_IminiSize ; + _space += Utils::BitsToWordBytes(new_IminiSize) ; + _rankVmini[k].Init(_Vmini[k], new_IminiSize) ; + _space += _rankVmini[k].GetSpace() - sizeof(_rankVmini[k]) ; + + _Ilongmini[k].Resize(new_IlongminiSize) ; + _space += _Ilongmini[k].GetSpace() - sizeof(_Ilongmini[k]) ; + } + } + } + + if (0 && _speed >= 2) // Now we are using Rank9 and bit operator, so no need for the precomputed element + { + // The precomputed short _miniblocks + unsigned int k ; + for (k = 0 ; k <= 1 ; ++k) + { + if (!(selectTypeSupport & (1<> l) & 1ull)==k) + { + _precomputedShortMiniBlock[k].Write(i * _precomputebElem + j, l) ; + ++j ; + if ((int)j >= _precomputebElem) + break ; + } + } + } + _space += _precomputedShortMiniBlock[k].GetSpace() - sizeof(_precomputedShortMiniBlock[k]) ; + } + if (_speed >= 4) + free(posBuffer) ; + } + } + + // Return the index of the ith (1-index ith) 1. + size_t Query(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n) const + { + return GeneralQuery(i, rank, B, n, 1) ; + } + + // Return the index of the ith (1-index ith) 0. 
+ size_t Query0(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n) const + { + return GeneralQuery(i, rank, B, n, 0) ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _space) ; + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _speed) ; + + if (_speed == DS_SELECT_SPEED_NO || _n == 0) + return ; + + SAVE_VAR(fp, _longBlockLength) ; + SAVE_VAR(fp, _minib); + SAVE_VAR(fp, _longMiniBlockLength) ; + SAVE_VAR(fp, _b) ; + SAVE_VAR(fp, _totalOneCnt) ; + + size_t blockCnt[2] ; + blockCnt[0] = DIV_CEIL((_n - _totalOneCnt), _b) + 1 ; + blockCnt[1] = DIV_CEIL(_totalOneCnt, _b) + 1 ; + for (int i = 0 ; i <= 1 ; ++i) + { + size_t size = Utils::BitsToWords(blockCnt[i]) ; + fwrite(_S[i], sizeof(_S[i][0]), blockCnt[i], fp) ; + if (_speed >= 2) + { + fwrite(_V[i], sizeof(_V[i][0]), size, fp) ; + _rankV[i].Save(fp) ; + _I[i].Save(fp) ; + } + if (_speed >= 3) + _Imini[i].Save(fp) ; + } + } + + void Load(FILE *fp) + { + Free() ; + + LOAD_VAR(fp, _space) ; + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _speed) ; + + if (_speed == DS_SELECT_SPEED_NO || _n == 0) + return ; + + LOAD_VAR(fp, _longBlockLength) ; + LOAD_VAR(fp, _minib); + LOAD_VAR(fp, _longMiniBlockLength) ; + LOAD_VAR(fp, _b) ; + LOAD_VAR(fp, _totalOneCnt) ; + + size_t blockCnt[2] ; + blockCnt[0] = DIV_CEIL((_n - _totalOneCnt), _b) + 1 ; + blockCnt[1] = DIV_CEIL(_totalOneCnt, _b) + 1 ; + for (int i = 0 ; i <= 1 ; ++i) + { + size_t size = Utils::BitsToWords(blockCnt[i]) ; + _S[i] = (size_t *)malloc(sizeof(_S[i][0]) * blockCnt[i]) ; + fread(_S[i], sizeof(_S[i][0]), blockCnt[i], fp) ; + + if (_speed >= 2) + { + _V[i] = Utils::MallocByBits(blockCnt[i]) ; + fread(_V[i], sizeof(_V[i][0]), size, fp) ; + _rankV[i].Load(fp) ; + _I[i].Load(fp) ; + } + + if (_speed >= 3) + _Imini[i].Load(fp) ; + } + } +} ; +} + +#endif diff --git a/compactds/DS_Select_Test.hpp b/compactds/DS_Select_Test.hpp new file mode 100644 index 0000000..42303b5 --- /dev/null +++ b/compactds/DS_Select_Test.hpp @@ -0,0 +1,537 @@ +#ifndef _MOURISL_COMPACTDS_DS_SELECT_TEST 
+#define _MOURISL_COMPACTDS_DS_SELECT_TEST + +#include "Utils.hpp" +#include "DS_Rank.hpp" + +// The standalone data structe for select query on a plain bitvector with precomputed rank information +// n - bitvector length. m - number of 1s (or 0s for select0) +// Speed 1: Time complexity: O(log n/m) [space: O(n/w)] +// Speed 2: Time complexity: O(log log n) [space: O(n/log n)] +// Most of the space are reuse the rank structure though, +// so in practice the extra space is stil only O(n/w) +// Seems speed 3, 4 does not work properly... +// Speed 3: Time complexity: O(log log n) [space: O(n/log n)] +// inspired by the implementation in SDSL +// Speed 4: Time complexity: O(1) [space: O(n loglog n / sqrt(log n) + sqrt(n))] +// The textbook O(n/log log n)-space algorithm has too large factor +// for precomputed short miniblocks. Not very pratical. +// + +#define DS_SELECT_SPEED_NO 0 +#define DS_SELECT_SPEED_SAMPLED 1 +#define DS_SELECT_SPEED_RANKBINARY 2 +#define DS_SELECT_SPEED_DENSESAMPLE 3 +#define DS_SELECT_SPEED_CONSTANT 4 + +namespace compactds { +class DS_Select_Test +{ +private: + size_t *S[2] ; // sampled position for 0's and 1's + + // Data structures for long blocks + size_t longBlockLength ; + WORD *V[2] ; // indicator whether a S block is long (1) or short + DS_Rank rankV[2] ; + FixedSizeElemArray I[2] ; // precomputed index within long block + + int precomputeb ; // the precomputed offsets within a word of size b + int precomputebElem ; // how many 1s we should consider for such word. + + int minib ; // mini block size (the number of 1's) + size_t longMiniBlockLength ; // long mini block length, for speed==3,4 + WORD *Vmini[2] ; // indicator whether a S block is long mini or not + size_t VminiSize[2] ; + DS_Rank rankVmini[2] ; + FixedSizeElemArray Imini[2] ; // offset for the beginning of mini block + FixedSizeElemArray Ilongmini[2] ; // offset for each element in long-mini block + + // Concatenated precomputed short mini block's S. 
We need concatenation, otherwise too + // much overhead in the FixedSizeElemArray structure. + // Even without FixedSizeElemArray, the pointers will take too much space. + FixedSizeElemArray precomputedShortMiniBlock[2] ; + + int b ; // block size (the number of 1's in a block) or sampling rate + size_t n ; + size_t totalOneCnt ; + + size_t space ; + + int speed ; // 0: do not allocate; 1: slow, 2: medium, 3: medium-fast 4: fastest, constant time + + // The select that handles both types (0 or 1). + size_t GeneralQuery(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n, int type) const + { + if (i < 1 || i > n) + return POSITIVE_INF ; + if (n == 1) // i must == 1 here + return 1 ; + + size_t si = (i - 1) / b ; + + size_t l, m, r ; // variables for binary search. They are coordinates on B + l = S[type][si] ; + r = S[type][si + 1] ; + if ((i - 1) % b == 0) + return l ; + + if (speed == 1 || r - l < longBlockLength) // r-l is more efficient than V.access. + { + if (speed == 3) + { + // Adjust l, r using miniblocks + size_t oldl = l ; + l = oldl + Imini[type].Read((i - 1) / minib) ; + if ((i - 1) / minib + 1 < (unsigned int)(b / minib)) // only adjust r if it is not in the last miniblock in current block + r = oldl + Imini[type].Read((i - 1) / minib + 1) ; + } + if (speed <= 3) + { + --r ; + + // Locate the R + uint64_t *rankR = rank.GetR() ; + const int rankBlockSize = rank.GetBlockSize() ; // this block size is with respect to WORD + size_t rl, rr ; + size_t tmp ; + rl = l / (rankBlockSize * WORDBITS) ; + rr = r / (rankBlockSize * WORDBITS) ; + while (rl <= rr) + { + m = (rl + rr) / 2 ; + tmp = rankR[m << 1] ; + if (type == 0) + tmp = m * rankBlockSize * WORDBITS - tmp ; + + if (tmp < i) + rl = m + 1 ; + else + rr = m - 1 ; // rankR[0]==0 makes sure m>=1 in the process + } + + // Locate the subR + size_t remaining ; + if (type == 1) + remaining = i - rankR[rr<<1] ; + else + remaining = i - (rr * (rankBlockSize * WORDBITS) - rankR[rr<<1]) ; + + int subrl, 
subrr ; + subrl = 0 ; // the first subr has offset 0, so we don't store them. + subrr = 6 ; + if (rr * rankBlockSize + 1 + subrr >= rank.GetWordCnt()) + subrr = rank.GetWordCnt() - 2 - rr * rankBlockSize; + bool inFirstSubBlock = false ; + if (rank.GetWordCnt() <= 1 + || (type == 1 && rank.DecodeSubR(rr, 0) >= remaining) + || (type == 0 && WORDBITS - rank.DecodeSubR(rr, 0) >= remaining) + || subrr < subrl) // The case that the last block has only one subblock, which will not be allocated + inFirstSubBlock = true ; + + if ( !inFirstSubBlock ) + { + size_t rword = rankR[2 * rr + 1] ; + while (subrl <= subrr) + { + m = (subrl + subrr) / 2 ; + tmp = (rword >> (m * 9ull)) & 0x1ff ; + //printf("%d %d. %llu\n", m, tmp, rword) ; + if (type == 0) + tmp = (m + 1) * WORDBITS - tmp ; // plus 1 here to incorporate the first sub block + if (tmp < remaining) + subrl = m + 1 ; + else + subrr = m - 1 ; // the in firstsubblock test makes sure this part won't under-flow + } + + if (type == 1) + remaining -= (rword >> (subrr * 9)) & 0x1ff ; + else + remaining -= ((subrr + 1) * WORDBITS - ((rword >> (subrr * 9)) & 0x1ff)) ; + } + + // Processing the last WORD + size_t lastWi = 0 ; // index of the last word + WORD lastW = 0 ; + if (inFirstSubBlock) + lastWi = rr * rankBlockSize ; + else + lastWi = rr * rankBlockSize + subrr + 1 ; // here the rr is to compensate for the first subblock missed in every sub block + lastW = B[lastWi] ; + size_t j ; + + int sum = 0 ; + for (j = 0 ; j < WORDBITS ; j += precomputeb) + { + WORD x = (lastW >> j) & MASK(precomputeb) ; + int tmp = Utils::Popcount(x) ; + if (type == 0) + tmp = precomputeb - tmp ; + if (sum + tmp >= (int)remaining) + { + return lastWi * WORDBITS + j + precomputedShortMiniBlock[type].Read(x * precomputebElem + remaining - sum - 1) ; + } + sum += tmp ; + } + return POSITIVE_INF ; // should not reach here. 
+ } + else // speed >= 4 + { + size_t skippedMiniBlocksInLong = rankV[type].Query(si, V[type], n, 0) * (b/minib) ; + size_t iMini = (i - 1) / minib - skippedMiniBlocksInLong ; + if ( Utils::BitRead(Vmini[type], iMini) ) + { + // long mini block + if ((i-1) % minib == 0) + { + return l + Imini[type].Read(iMini) ; + } + else + { + size_t iLongMini = rankVmini[type].Query(iMini - skippedMiniBlocksInLong, + Vmini[type], VminiSize[type], 0) ; + //printf("%d\n", iLongMini * (minib - 1) + (i - 1)%minib - 1) ; + /*printf("b=%d minib=%d. i=%d iMini=%d skippedMini=%d iLongMini=%d l=%d Imini[iMini]=%d x=%d Ilongmini[x]=%d. ret=%d\n", + b, minib, i, iMini, skippedMiniBlocksInLong, + iLongMini, l, Imini[type].Read(iMini), + iLongMini * (minib - 1) + (i-1)%minib - 1, Ilongmini[type].Read(iLongMini * (minib - 1) + (i-1)%minib - 1), + l + Ilongmini[type].Read(iLongMini * (minib - 1) + (i-1) % minib - 1)) ;*/ + return l + Ilongmini[type].Read(iLongMini * (minib - 1) + (i-1) % minib - 1) ; + } + } + else + { + // short mini block + size_t offset = l + Imini[type].Read(iMini) ; + WORD localw = Utils::BitsRead(B, offset, offset + longMiniBlockLength - 2) ; + return offset + precomputedShortMiniBlock[type].Read(localw * minib + (i-1)%minib) ; + } + } + } + else + { + // long block with sparse 1's + size_t iI = (rankV[type].Query(si, V[type], n) - 1) * (b - 1); // block index in I + //printf("long block %d %d %d %d %d. 
%d\n", i, b, si, Utils::BitRead(V[type], si), iI, I[type].Read(iI + (i - 1)%b - 1)) ; + return I[type].Read(iI + (i - 1)%b - 1) ; + } + } +public: + DS_Select_Test() + { + S[0] = S[1] = NULL ; + V[0] = V[1] = NULL ; + Vmini[0] = Vmini[1] = NULL ; + n = totalOneCnt = b = space = 0 ; + } + + DS_Select_Test(int blockSize, const WORD *B, const int &n, int selectSpeed, int selectTypeSupport) + { + Init(blockSize, B, n, selectSpeed, selectTypeSupport) ; + } + + ~DS_Select_Test() { Free() ; } + + void Free() + { + int i ; + for (i = 0 ; i <= 1 ; ++i) + { + if (S[i] != NULL) + { + free(S[i]) ; + S[i] = NULL ; + } + + if (V[i] != NULL) + { + free(V[i]) ; + V[i] = NULL ; + } + rankV[i].Free() ; + I[i].Free() ; + + if (Vmini[i] != NULL) + { + free(Vmini[i]) ; + Vmini[i] = NULL ; + } + rankVmini[i].Free() ; + Imini[i].Free() ; + Ilongmini[i].Free() ; + precomputedShortMiniBlock[i].Free() ; + } + n = b = 0 ; + } + + size_t GetSpace() { return space + sizeof(*this); } + + // blockSize is the number of WORDs for each R + // selectTypeSupport: bit coding for whether allocate memory to support select0 and select1 + // 0-bit: select 0, 1-bit: selct1; so 3 means support both + void Init(int blockSize, const WORD *B, const size_t &n, int selectSpeed, int selectTypeSupport) + { + if (selectSpeed == 0 || selectTypeSupport == 0 || n <= 1) + return ; + size_t i, j ; + size_t wordCnt = Utils::BitsToWordBytes(n) / sizeof(WORD) ; + size_t *posBuffer = NULL; + this->n = n ; + speed = selectSpeed ; + space = 0 ; + b = blockSize ; + + // Set the parameters based the desired speed + if (b <= (int)WORDBITS) + { + b = WORDBITS * WORDBITS; + if (speed >= 2) + b = WORDBITS * Utils::Log2Ceil(n) ; //* Utils::Log2Ceil( Utils::Log2Ceil(n) ) ; + if (speed == 4) + b = WORDBITS * Utils::Log2Ceil(n) ; + } + + longBlockLength = b * Utils::Log2Ceil(n) * Utils::Log2Ceil(n) ; // Two sampled 1's are too far apart. 
It should be b*log^2 n + if (speed == 2 || speed == 3) + { + if (n >= (1<<30)) + precomputeb = 16 ; // relate to precomputed select + else + precomputeb = 8 ; + precomputebElem = precomputeb ; + if (speed == 3) + { + minib = CEIL(sqrt((double)b)) ; + minib -= b % minib ; + if (minib < 3) + { + minib = 3 ; + if (b % 3) + minib = 3 + b%3 ; + } + } + } + else if (speed == 4) + { + //minib = sqrt(log n) + minib = CEIL(pow((double)b, 0.25)) ; // We make minib depends on the choice of b so it is easier to control the block size. + minib -= b % minib ; + if (minib < 3) + { + minib = 3 ; + if (b % 3) + minib = 3 + b%3 ; + } + longMiniBlockLength = DIV_CEIL(minib * minib, 2) ; + posBuffer = (size_t*)malloc(sizeof(*posBuffer) * (b+1)) ; + precomputeb = longMiniBlockLength - 1 ; + precomputebElem = minib ; + } + + totalOneCnt = 0 ; + for (i = 0 ; i < wordCnt ; ++i) + totalOneCnt += Utils::Popcount(B[i]) ; + + // Sample every other b 1's (or 0's) + size_t blockCnt[2] ; + blockCnt[0] = DIV_CEIL((n - totalOneCnt), b) + 1 ; + blockCnt[1] = DIV_CEIL(totalOneCnt, b) + 1 ; + for (i = 0 ; i <= 1 ; ++i) + { + if (!(selectTypeSupport & (1<>j) & 1ull ; + if (!(selectTypeSupport & (1<= 2) + { + int k ; + for (k = 0 ; k <= 1 ; ++k) + { + if (!(selectTypeSupport & (1<= 3) + Imini[k].Malloc(Utils::Log2Ceil(longBlockLength), blockCnt[k] * (b/minib)) ; + + if (speed >= 4) + { + Vmini[k] = Utils::MallocByBits(blockCnt[k] * (b / minib)) ; + // The long mini block can be almost as larage as long block length - 1 + Ilongmini[k].Malloc(Utils::Log2Ceil(longBlockLength), DIV_CEIL(n, longMiniBlockLength) * minib) ; + } + + size_t newISize = 0 ; + size_t newIminiSize = 0 ; + size_t newIlongminiSize = 0 ; + for (i = 0 ; i < blockCnt[k] - 1 ; ++i) + { + if (S[k][i + 1] - S[k][i] >= longBlockLength) + { + Utils::BitSet(V[k], i) ; + // The first element is already stored in S, so no need to store it + for (j = S[k][i] + 1 ; j < S[k][i + 1] ; ++j) + { + if (Utils::BitRead(B, j) == k) + { + 
I[k].Write(newISize, j) ; + ++newISize ; + } + } + + if (speed == 3) // For speed 3, we still fill up I mini + // so we don't need to acces rankV for efficiency. + // Maybe I should do this to speed 4 as well. + { + for (i = 0 ; i < (size_t)(b / minib) ; ++i) + { + Imini[k].Write(newIminiSize, 0) ; + ++newIminiSize ; + } + } + } + else if (speed >= 3) // short block case, we only need to process them when speed==3 + { + int minicnt = 1; + size_t prevj = S[k][i] ; + if (speed >= 4) + posBuffer[0] = S[k][i] ; + // j reaches the beginning of the next block so we can wrap up any unadded + // k's to the miniblock. This handles both case that the last miniblock in + // a block or the last miniblock in the whole bit vector. + for (j = S[k][i] + 1; j <= S[k][i + 1] ; ++j) + { + int bit = 0 ; + if (j < n) + bit = Utils::BitRead(B, j) ; + if (bit == k || (j == S[k][i + 1] && minicnt > 0)) + { + if (minicnt == minib || (j == S[k][i + 1] && minicnt > 0)) + { + Imini[k].Write(newIminiSize, prevj - S[k][i]) ; + ++newIminiSize ; + + if (speed >= 4 && j - prevj >= longMiniBlockLength) + { + int l ; + Utils::BitSet(Vmini[k], newIminiSize - 1) ; + for (l = 1 ; l < minicnt ; ++l) // we don't need to store the first element + { + Ilongmini[k].Write(newIlongminiSize, posBuffer[l] - S[k][i]) ; + ++newIlongminiSize ; + } + } + + prevj = j ; + minicnt = 0 ; + } + + if (bit == k) + { + if (speed >= 4) + posBuffer[minicnt] = j ; + ++minicnt ; + } + } + } + } + } + I[k].Resize(newISize) ; + space += I[k].GetSpace() - sizeof(I[k]) ; + + rankV[k].Init(-1, V[k], blockCnt[k]) ; + space += rankV[k].GetSpace() - sizeof(rankV[k]) ; + + if (speed >= 3) + { + Imini[k].Resize(newIminiSize) ; + space += Imini[k].GetSpace() - sizeof(Imini[k]) ; + //printf("%d %d. %d. 
%d %d\n", Imini[k].GetSpace(), newIminiSize, Utils::Log2Ceil(longBlockLength), minib, n/minib) ; + } + + if (speed >= 4) + { + Vmini[k] = (WORD *)realloc(Vmini[k], + Utils::BitsToWordBytes(newIminiSize)) ; + VminiSize[k] = newIminiSize ; + space += Utils::BitsToWordBytes(newIminiSize) ; + rankVmini[k].Init(-1, Vmini[k], newIminiSize) ; + space += rankVmini[k].GetSpace() - sizeof(rankVmini[k]) ; + + Ilongmini[k].Resize(newIlongminiSize) ; + space += Ilongmini[k].GetSpace() - sizeof(Ilongmini[k]) ; + } + } + } + + if (speed >= 2) + { + // The precomputed short miniblocks + unsigned int k ; + for (k = 0 ; k <= 1 ; ++k) + { + if (!(selectTypeSupport & (1<> l) & 1ull)==k) + { + precomputedShortMiniBlock[k].Write(i * precomputebElem + j, l) ; + ++j ; + if ((int)j >= precomputebElem) + break ; + } + } + } + space += precomputedShortMiniBlock[k].GetSpace() - sizeof(precomputedShortMiniBlock[k]) ; + } + if (speed >= 4) + free(posBuffer) ; + } + } + + // Return the index of the ith (1-index ith) 1. + size_t Query(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n) const + { + return GeneralQuery(i, rank, B, n, 1) ; + } + + // Return the index of the ith (1-index ith) 0. + size_t Query0(size_t i, const DS_Rank9 &rank, const WORD *B, const size_t &n) const + { + return GeneralQuery(i, rank, B, n, 0) ; + } +} ; +} +#endif diff --git a/compactds/DifferenceCover.hpp b/compactds/DifferenceCover.hpp new file mode 100644 index 0000000..7e1aa76 --- /dev/null +++ b/compactds/DifferenceCover.hpp @@ -0,0 +1,201 @@ +#ifndef _MOURISL_COMPACTDS_DIFFERENCECOVER +#define _MOURISL_COMPACTDS_DIFFERENCECOVER + +#include "Utils.hpp" +#include "SimpleVector.hpp" + +#include +#include + +// The class handling difference covers +// Difference cover is a set of numbers D={a_0, ... a_{m-1}} in range [0, v) +// such that every i in [0,1) there is some a_j, a_k in D s.t. i=(a_j-a_k)%v. +// So the name comes from the differences of a set can cover all the element. 
+// This class also handles when query elements larger than v (cyclic difference cover) +namespace compactds { +class DifferenceCover +{ +private: + int v ; // period size + int *dcs ; // DCs + int m ; // number of DCs + std::map dcMap ; // maybe replace this with a bit vector later + int *precomputedD ; // precomputed information for Delta query + + int GetB(int i, int r) + { + if (i < r) + return 1 ; + else if (i < r + 1) + return r + 1 ; + else if (i < 2 * r + 1) + return 2 * r + 1; + else if (i < 4 * r + 2) + return 4 * r + 3 ; + else if (i < 5 * r + 3) + return 2 * r + 2 ; + else if (i < 6 * r + 3) + return 1 ; + else + return 0 ; // ERROR + } +public: + DifferenceCover() + { + v = 4096 ; + dcs = NULL ; + m = 0 ; + } + + ~DifferenceCover() + { + if (dcs) + { + free(dcs) ; + free(precomputedD) ; + } + } + + // The construction is based on Colbourn, Ling 2000 + void Init(int v) + { + int i ; + if (v <= 13) + v = 14 ; + + this->v = v ; + // Use the Colbourn, Ling method to find the cover + int r = CEIL((-36 + sqrt(1296 - 96*(13 - v)))/48.0) ; + SimpleVector rawdcs ; + rawdcs.Reserve(6 * r + 4) ; + rawdcs.PushBack(0) ; + for (i = 1 ; i <= 6 * r + 3 ; ++i) + rawdcs.PushBack( rawdcs[i - 1] + GetB(i - 1, r)) ; + + // Put the finalized difference cover + m = 0 ; + for (i = 0 ; i < 6 * r + 4 ; ++i) + { + int dc = rawdcs[i] % v ; + if (dcMap.find(dc) == dcMap.end()) + { + dcMap[dc] = m ; + ++m ; + } + } + + dcs = (int *)malloc(sizeof(dcs[0]) * m) ; + i = 0 ; + for (std::map::iterator it = dcMap.begin() ; it != dcMap.end() ; ++it, ++i) + { + dcs[i] = it->first ; + } + + // Reorder them into increasing order + std::sort(dcs, dcs + m) ; + for (i = 0 ; i < m ; ++i) + { + dcMap[dcs[i]] = i ; + } + + // Precompute the look up table d for Delta query + // Lemma 4 in Fast Lightweight Suffix Array Construction and Checking + // We can enumerate all the differences from D + int j ; + precomputedD = (int *)malloc(sizeof(precomputedD[0]) * v) ; + memset(precomputedD, -1, 
sizeof(precomputedD[0]) * v) ; + precomputedD[0] = 0 ; + for (i = 0 ; i < m ; ++i) + { + for (j = 0 ; j < m ; ++j) + { + int d = dcs[j] - dcs[i] ; + if (d < 0) + d += v ; + precomputedD[d] = dcs[i] ; + } + } + } + + static size_t EstimateCoverSize(int v) + { + if (v <= 13) + return POSITIVE_INF ; + int r = CEIL((-36 + sqrt(1296 - 96*(13 - v)))/48.0) ; + return 6 * r + 4 ; + } + + // Check whether an element is in diff-cover + bool IsInDC(size_t i) + { + if (dcMap.find(i%v) != dcMap.end()) + return true ; + return false ; + } + + int GetV() + { + return v ; + } + + // Get the size of the DC that can cover [0, n) + size_t GetSize(size_t n) + { + int i ; + for (i = 0 ; i < m ; ++i) + { + if (dcs[i] >= (int)(n % v)) + break ; + } + return n / v * m + i ; + } + + // Return the difference cover in a list to cover [0, n) + size_t GetDiffCoverList(size_t n, size_t *dcList) + { + int i ; + size_t c ; + size_t cycleCnt = DIV_CEIL(n, v) ; + size_t ret = 0 ; + for (c = 0 ; c < cycleCnt ; ++c) + { + for (i = 0 ; i < m ; ++i) + { + size_t x = c * v + dcs[i] ; + if (x >= n) + break ; + dcList[ret] = x ; + ++ret ; + } + } + return ret ; + } + + // Return the index when skipping the non-DC elements in the list + // Assume i is in the difference cover. + size_t CompactIndex(size_t i) + { + return i / v * m + dcMap[i % v] ; + //int k = dcMap[i%v] ; + //return (n / v) * k + (k < coverCntInLastCycle ? 
k : coverCntInLastCycle) + i / v ; + } + + // Return the offset delta that (i+delta)%v and (j+delta)%v is in the difference cover + int Delta(size_t i, size_t j) + { + int ri = i % v ; + int rj = j % v ; + + int d = (rj - ri)%v ; + if (d < 0) + d += v ; + d = (precomputedD[d] - ri)%v ; + if (d < 0) + d += v ; + + return d ; + } +} ; +} + +#endif diff --git a/compactds/EliasCode.hpp b/compactds/EliasCode.hpp new file mode 100644 index 0000000..48e8b27 --- /dev/null +++ b/compactds/EliasCode.hpp @@ -0,0 +1,74 @@ +#ifndef _MOURISL_COMPACTDS_ELIASCODE +#define _MOURISL_COMPACTDS_ELIASCODE + +#include "Utils.hpp" + +namespace compactds { +class EliasCode +{ +public: + EliasCode() {} + ~EliasCode() {} + + // These function will output + // These methods can only encode positive numbers. + // The bits are also reversed so accessing them is easier. + // Even though the input value is 32-bit, the encoded bits can be greater than 32-bit. + static WORD Unary(int in, int &l) + { + l = in ; + return 1ull << (in - 1); + } + + // Elias gamma + static WORD Gamma(int in, int &l) + { + int i ; + const int n = Utils::CountBits(in) ; + WORD ret = Unary(n, l) ; + // the rightmost bit of Unary(n) and the leftmost bit of in are both 1, so we only need to shift by once. + for (i = n - 2 ; i >= 0 ; --i, ++l) + { + ret |= (((in>>i)&1ull) << l) ; + } + //printf("%s: %d => %d %d %d\n", __func__, in, ret, n, l); + return ret ; + } + + // Elias delta + static WORD Delta(int in, int &l) + { + int i ; + int n = Utils::CountBits(in) ; + WORD ret = Gamma(n, l); + + for (i = n - 2 ; i >= 0 ; --i, ++l) + ret |= (((in>>i)&1) << l) ; + return ret ; // the leftmost bit of in is implicitly 1. 
+ } + + // Read in one Gamma encoded word starting from W's ith bits + // return: the value; l - # of processed bits + static int ReadOneGamma(WORD *W, size_t i, int &l) + { + size_t j, k ; + // Determine the length + for (j = i ; Utils::BitRead(W, j) == 0 ; ++j) + ; + l = j - i + 1 ; + int ret = 1 ; + for (k = j + 1 ; k < j + l ; ++k) + ret = (ret << 1) | Utils::BitRead(W, k) ; + l = k - i ; + return ret ; + } + + // TODO: implement this + static int ReadOneDelta(WORD *W, size_t i, int &out) + { + return 0 ; + } +} ; +} + +#endif diff --git a/compactds/FMBuilder.hpp b/compactds/FMBuilder.hpp new file mode 100644 index 0000000..d595467 --- /dev/null +++ b/compactds/FMBuilder.hpp @@ -0,0 +1,504 @@ +#ifndef _MOURISL_COMPACTDS_FM_BUILDER +#define _MOURISL_COMPACTDS_FM_BUILDER + +// Build BWT and other auxiliary datas from text T using blockwise suffix array sorting + +#include +#include + +#include + +#include "Utils.hpp" +#include "SuffixArrayGenerator.hpp" + +namespace compactds { +struct _FMBuilderParam +{ + size_t n ; + + size_t saBlockSize ; + int saDcv ; + size_t threadCnt ; + + int sampleRate ; + int sampleStrategy ; // on SA, on T or on the ends of BWT runs. + size_t sampleSize ; + size_t *sampledSA ; + + int precomputeWidth ; + size_t precomputeSize ; + std::pair *precomputedRange ; + + bool printLog ; + + size_t maxLcp ; // only consider LCP up to this point + + std::map selectedISA ; + std::map selectedSA ; // reverse selectedISA + + WORD *semiLcpGreater ; // The LCP is between current suffix and its previous one + WORD *semiLcpEqual ; + + size_t adjustedSA0 ; // specialized sampled SA. + + FILE *dumpSaFp ; // dump SA to this file. + + _FMBuilderParam() + { + sampleStrategy = 0 ; + saBlockSize = 1<<24 ; + saDcv = 4096 ; + sampleRate = 1<<5 ; + threadCnt = 1 ; // the number of threads for sorting. 
+ precomputeWidth = 10 ; + adjustedSA0 = 0 ; + + printLog = true ; + + maxLcp = 0 ; + dumpSaFp = NULL ; + + // The memory for these arrays shall handled explicitly outside. + sampledSA = NULL ; + precomputedRange = NULL ; + semiLcpGreater = NULL ; + semiLcpEqual = NULL ; + } + + // Use this free with caution, + // as some of the pointer will be + // used in FMIndexAuxData. + // Use this only when generating BWT string. + void Free() + { + if (sampledSA != NULL) + free(sampledSA) ; + if (precomputedRange != NULL) + free(precomputedRange) ; + if (semiLcpGreater != NULL) + free(semiLcpGreater) ; + if (semiLcpEqual != NULL) + free(semiLcpEqual) ; + } +} ; + +struct _FMBuilderChunkThreadArg +{ + int tid ; + int threadCnt ; + + FixedSizeElemArray *T ; + size_t n ; + + SuffixArrayGenerator *saGenerator ; + size_t from, to ; + std::vector< std::vector > pos ; +} ; + +struct _FMBuilderSASortThreadArg +{ + int tid ; + int threadCnt ; + + FixedSizeElemArray *T ; + size_t n ; + + SuffixArrayGenerator *saGenerator ; + size_t *sa ; + size_t saSize ; + + size_t accuChunkSize ; + + WORD *semiLcpGreater ; // The LCP is between current suffix and its previous one + WORD *semiLcpEqual ; + size_t maxLcp ; // only consider LCP up to this point +} ; + +class FMBuilder +{ +private: + // Return the LCP up until the specified value bewteen T[i,...], T[j,...] 
+ static size_t ComputeSemiLcp(FixedSizeElemArray &T, size_t n, size_t i, size_t j, size_t maxLCP) + { + size_t k ; + if (i >= n || j >= n || i < 0 || j < 0) + return 0 ; + else + { + for (k = 0 ; k < maxLCP && i + k < n && j + k < n ; ++k) + { + if (T.Read(i + k) != T.Read(j + k)) + break ; + } + return k ; + } + } + + static void *PosInChunk_Thread(void *arg) + { + struct _FMBuilderChunkThreadArg *pArg = (_FMBuilderChunkThreadArg *)arg ; + size_t segLen = DIV_CEIL(pArg->n, pArg->threadCnt) ; + size_t s = segLen * pArg->tid ; + size_t e = s + segLen - 1 ; + pArg->saGenerator->GetChunksPositions(*(pArg->T), pArg->n, + pArg->from, pArg->to, s, e, pArg->pos) ; + pthread_exit(NULL) ; + } + + // Compare the semiLCP between T[sai...], and T[saj,...], write the result to semiLcp[biti] + static void SetSemiLcpBit(FixedSizeElemArray &T, size_t n, size_t sai, size_t saj, size_t biti, size_t maxLcp, WORD *semiLcpGreater, WORD *semiLcpEqual) + { + size_t l = 0 ; + l = ComputeSemiLcp(T, n, sai, saj, maxLcp + 1) ; + if (l > maxLcp) + Utils::BitSet(semiLcpGreater, biti) ; + else if (l == maxLcp) + Utils::BitSet(semiLcpEqual, biti) ; + } + + static void *SortSA_Thread(void *arg) + { + struct _FMBuilderSASortThreadArg *pArg = (struct _FMBuilderSASortThreadArg *)arg ; + pArg->saGenerator->SortSuffixByPos(*(pArg->T),pArg->n, + pArg->sa, pArg->saSize, pArg->sa ) ; + //printf("TEST %d\n", saSortThreadArgs[0][0].sa[0]) ; + + if (pArg->maxLcp > 0) + { + size_t i ; + // The first element's LCP is between the last element from previous + // chunk, need to process outside. 
+ for (i = 1 ; i < pArg->saSize ; ++i) + SetSemiLcpBit(*(pArg->T), pArg->n, pArg->sa[i], pArg->sa[i - 1], pArg->accuChunkSize + i, + pArg->maxLcp, pArg->semiLcpGreater, pArg->semiLcpEqual) ; + } + + pthread_exit(NULL) ; + } + +public: + // Allocate and init the memorys for auxiliary data arrays in FM index + // chrbit: number of bits for each character + static void MallocAuxiliaryData(size_t chrbit, size_t n, struct _FMBuilderParam ¶m) + { + size_t i ; + param.n = n ; + + param.sampleSize = DIV_CEIL(n, param.sampleRate) ; + param.sampledSA = (size_t *)malloc(sizeof(size_t) * DIV_CEIL(n, param.sampleRate)) ; + + size_t size = 1ull<<(chrbit * param.precomputeWidth) ; + param.precomputeSize = size ; + param.precomputedRange = (std::pair *)malloc( + sizeof(std::pair) * size) ; + for (i = 0 ; i < size ; ++i) + { + param.precomputedRange[i].first = 0 ; + param.precomputedRange[i].second = 0 ; + } + + if (param.maxLcp > 0) + { + param.semiLcpGreater = Utils::MallocByBits(n) ; + param.semiLcpEqual = Utils::MallocByBits(n) ; + } + } + + // Determine the parameters for block size and difference cover size + // based on memory requirement (bytes). + // Assume mem is quite large. 
+ static void InferParametersGivenMemory(size_t n, int alphabetSize, size_t memory, + struct _FMBuilderParam ¶m) + { + size_t logBlockSize ; + size_t dcv ; + size_t alphabetBits = Utils::Log2Ceil(alphabetSize) ; + + size_t bestTime = POSITIVE_INF ; + size_t bestBlockSize = 0 ; + size_t bestDcv = 0 ; + + if (2 * n * alphabetBits / 8 > memory) + return ; + + memory -= 2 * n * alphabetBits / 8 ; + for (dcv = 512 ; dcv <= 8196 ; dcv *= 2) + { + size_t dcSize = DIV_CEIL(n, dcv) * DifferenceCover::EstimateCoverSize(dcv) ; + for (logBlockSize = 24 ; logBlockSize <= 50 ; ++logBlockSize) + { + size_t blockSize = 1ull<= n / param.threadCnt) + // break ; + + size_t space = (param.threadCnt * blockSize + + dcSize + DIV_CEIL(n, param.sampleRate) + + (1ull<<(alphabetBits * param.precomputeWidth))*2 + ) * WORDBYTES ; + + if (space <= memory) + { + size_t iterations = DIV_CEIL(n, (param.threadCnt * blockSize)) ; + size_t time = dcSize * Utils::Log2Ceil(n) // sort difference cover + + iterations * n * dcv // making cuts + + iterations * (blockSize * Utils::Log2Ceil(blockSize) + dcv * blockSize) ; // sort block + //printf("%lu(%lu) %lu %lu. %lu %lu. %lu\n", dcv, dcSize, blockSize, iterations, + // space, time, memory) ; + if (time < bestTime) + { + bestBlockSize = blockSize ; + bestDcv = dcv ; + bestTime = time ; + } + } + else + break ; + } + } + + if (bestDcv != 0) + { + param.saBlockSize = bestBlockSize ; + param.saDcv = bestDcv ; + + if (param.printLog) + { + Utils::PrintLog("Estimated block size: %lu; dcv:%d", + param.saBlockSize, param.saDcv) ; + } + } + } + + // T: text + // n: len(text) + // firstISA: ISA[0] + // Returned information is in BWT, firstISA, which are important in the F column. param holds all the other allocated array. 
+ static void Build(FixedSizeElemArray &T, size_t n, int alphabetSize, + FixedSizeElemArray &BWT, size_t &firstISA, + struct _FMBuilderParam ¶m) + { + size_t i, j, k ; + SuffixArrayGenerator saGenerator ; + MallocAuxiliaryData(Utils::Log2Ceil(alphabetSize), n, param) ; + BWT.Malloc(Utils::Log2Ceil(alphabetSize), n) ; + if (param.printLog) + Utils::PrintLog("Generate difference cover and chunks.") ; + size_t cutCnt = saGenerator.Init(T, n, param.saBlockSize, param.saDcv, alphabetSize) ; + if (param.printLog) + Utils::PrintLog("Found %llu chunks.", cutCnt) ; + size_t bwtFilled = 0 ; + + pthread_t *threads = (pthread_t *)malloc(sizeof(*threads) * param.threadCnt) ; + struct _FMBuilderChunkThreadArg *chunkThreadArgs ; + struct _FMBuilderSASortThreadArg *saSortThreadArgs ; + pthread_attr_t attr ; + + pthread_attr_init( &attr ) ; + pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_JOINABLE ) ; + size_t **sa ; // suffix array chunks + size_t *saChunkSize ; // actual size + size_t *saChunkCapacity ; // the memory capacity + + chunkThreadArgs = new struct _FMBuilderChunkThreadArg[param.threadCnt] ; + for (i = 0 ; i < param.threadCnt ; ++i) + { + chunkThreadArgs[i].tid = i ; + chunkThreadArgs[i].threadCnt = param.threadCnt ; + chunkThreadArgs[i].saGenerator = &saGenerator ; + chunkThreadArgs[i].T = &T ; + chunkThreadArgs[i].n = n ; + } + + sa = (size_t **)malloc(sizeof(sa[0]) * param.threadCnt) ; + saChunkSize = (size_t *)malloc(sizeof(saChunkSize) * param.threadCnt) ; + saChunkCapacity = (size_t *)malloc(sizeof(saChunkCapacity) * param.threadCnt) ; + saSortThreadArgs = (struct _FMBuilderSASortThreadArg*)malloc(sizeof(struct _FMBuilderSASortThreadArg) * param.threadCnt) ; + for (i = 0 ; i < param.threadCnt ; ++i) + { + sa[i] = NULL ; + saChunkSize[i] = 0 ; + saChunkCapacity[i] = 0 ; + + saSortThreadArgs[i].tid = i ; + saSortThreadArgs[i].threadCnt = param.threadCnt ; + saSortThreadArgs[i].saGenerator = &saGenerator ; + saSortThreadArgs[i].T = &T ; + saSortThreadArgs[i].n = 
n ; + + saSortThreadArgs[i].maxLcp = param.maxLcp ; + saSortThreadArgs[i].semiLcpGreater = param.semiLcpGreater ; + saSortThreadArgs[i].semiLcpEqual = param.semiLcpEqual ; + } + + size_t lastSA = 0 ; // record the last SA from previous batch or chunk + size_t accuChunkSizeForSort = 0 ; // accumulated chunk size + i = 0 ; + + if (param.dumpSaFp) + fwrite(&n, sizeof(size_t), 1, param.dumpSaFp) ; + + // Start the core iterations + for (i = 0 ; i < cutCnt ; i += param.threadCnt) + { + // Load positions for current batch + if (param.printLog) + Utils::PrintLog("Extract %d chunks. (%lu/%lu chunks finished)", param.threadCnt, i, cutCnt) ; + for (j = 0 ; j < param.threadCnt ; ++j) + { + chunkThreadArgs[j].from = i ; + chunkThreadArgs[j].to = (i + param.threadCnt - 1 < n ? i + param.threadCnt - 1 : n - 1) ; + pthread_create(&threads[j], &attr, PosInChunk_Thread, (void *)(chunkThreadArgs + j)) ; + //PosInChunk_Thread((void *)(chunkThreadArgs + j)) ; + } + + if (param.printLog) + Utils::PrintLog("Wait for the chunk extraction to finish.") ; + for (j = 0 ; j < param.threadCnt ; ++j) + pthread_join(threads[j], NULL) ; + + size_t chunkCnt = param.threadCnt ; + if (i + chunkCnt >= cutCnt) + chunkCnt = cutCnt - i ; + + // concatenate the pos in the chunks + for (j = 0 ; j < chunkCnt ; ++j) + { + size_t totalSize = 0 ; + for (k = 0 ; k < param.threadCnt ; ++k) + totalSize += chunkThreadArgs[k].pos[j].size() ; + saChunkSize[j] = totalSize ; + if (totalSize > saChunkCapacity[j]) + { + free(sa[j]) ; + saChunkCapacity[j] = totalSize ; + sa[j] = (size_t *)malloc(sizeof(sa[j]) * totalSize) ; + } + + totalSize = 0 ; + for (k = 0 ; k < param.threadCnt ; ++k) + { + memcpy(sa[j] + totalSize, chunkThreadArgs[k].pos[j].data(), + sizeof(sa[j][0]) * chunkThreadArgs[k].pos[j].size()) ; + totalSize += chunkThreadArgs[k].pos[j].size() ; + std::vector().swap(chunkThreadArgs[k].pos[j]) ; + } + } + + // Submit the batch of chunks to sorting + if (param.printLog) + Utils::PrintLog("Submit %d chunks.", 
chunkCnt) ; + for (j = 0 ; j < chunkCnt ; ++j) + { + if (param.printLog) + Utils::PrintLog("Chunk %d elements: %llu", j, saChunkSize[j]) ; + saSortThreadArgs[j].sa = sa[j] ; + saSortThreadArgs[j].saSize = saChunkSize[j] ; + saSortThreadArgs[j].accuChunkSize = accuChunkSizeForSort ; + accuChunkSizeForSort += saChunkSize[j] ; + pthread_create(&threads[j], &attr, SortSA_Thread, (void *)(saSortThreadArgs + j)) ; + //SortSA_Thread( (void *)(saSortThreadArgs + j)) ; + } + + // Wait for current batch to finish + if (param.printLog) + Utils::PrintLog("Wait for the chunk sort to finish.") ; + for (j = 0 ; j < chunkCnt ; ++j) + pthread_join(threads[j], NULL) ; + + // Process the information from the chunks. + if (param.printLog) + Utils::PrintLog("Postprocess %d chunks.", chunkCnt) ; + for (j = 0 ; j < chunkCnt ; ++j) + { + size_t l ; + size_t size = saSortThreadArgs[j].saSize ; + size_t *saChunk = saSortThreadArgs[j].sa ; + + // Fill FM string + //printf("%d %d %d %d\n", size, j, saSortThreadArgs[prevPosTag][j].pos->at(1), + // saChunk[0]) ; + for (l = 0 ; l < size ; ++l) + { + if (saChunk[l] == 0) + { + firstISA = bwtFilled ; + BWT.Write(bwtFilled, T.Read(n - 1)) ; + } + else + BWT.Write(bwtFilled, T.Read( saChunk[l] - 1 ) ) ; + + if (param.sampledSA != NULL && bwtFilled % param.sampleRate == 0) + param.sampledSA[bwtFilled / param.sampleRate] = saChunk[l] ; + + if (param.precomputedRange != NULL) + { + int width = param.precomputeWidth ; + WORD w = 0 ;// word + if (saChunk[l] + width <= n) + { + w = T.PackRead(saChunk[l], width) ; + if (param.precomputedRange[w].second == 0) + param.precomputedRange[w].first = bwtFilled ; + ++param.precomputedRange[w].second ; + } + /*else // ignore the case near the end of the string + { + w = T.PackRead(saChunk[l], n - saChunk[l]) ; + //size_t used = n - saChunk[l] ; + //w = (T.PackRead(0, w - used)) << used | w + }*/ + + } + + if (param.selectedISA.size() != 0 ) + { + if (param.selectedISA.find(saChunk[l]) != param.selectedISA.end()) + 
param.selectedISA[saChunk[l]] = bwtFilled ; + } + + ++bwtFilled ; + } + + if (param.maxLcp > 0) + { + size_t offseti = bwtFilled - size ; // equiavlent to accuChunkSize + if (i > 0 || j > 0) // ignore the very first SA in the whole array + SetSemiLcpBit(T, n, saChunk[0], lastSA, offseti, param.maxLcp, + param.semiLcpGreater, param.semiLcpEqual) ; + } + + // the last element from previous chunk. + lastSA = saChunk[size - 1] ; + + if (param.dumpSaFp) + fwrite(saChunk, sizeof(saChunk[0]), size, param.dumpSaFp) ; + } + } // end of the main while loop for populating BWTs + + // Fill in the selectedSA + for (std::map::iterator iter = param.selectedISA.begin() ; + iter != param.selectedISA.end(); ++iter) + { + param.selectedSA[iter->second] = iter->first ; + } + std::map().swap(param.selectedISA) ; // ISA will not be useful + + free(threads) ; + pthread_attr_destroy(&attr) ; + delete[] chunkThreadArgs ; + for (j = 0 ; j < param.threadCnt ; ++j) + { + if (sa[j] != NULL) + { + free(sa[j]) ; + } + } + free(sa) ; + free(saChunkSize) ; + free(saChunkCapacity) ; + free(saSortThreadArgs) ; + } +} ; +} + +#endif diff --git a/compactds/FMIndex.hpp b/compactds/FMIndex.hpp new file mode 100644 index 0000000..b306dae --- /dev/null +++ b/compactds/FMIndex.hpp @@ -0,0 +1,491 @@ +#ifndef _MOURISL_COMPACTDS_FM_INDEX +#define _MOURISL_COMPACTDS_FM_INDEX + +#include + +#include "Alphabet.hpp" +#include "FixedSizeElemArray.hpp" +#include "FMBuilder.hpp" + +// Auxiliary data, other than the BWT and F (alphabet partial sum), for FM index +// Should be directly initalized through FMBuilderParam, simplifies the parameter passing +namespace compactds { +struct _FMIndexAuxData +{ + size_t n ; // the length of the text + + int sampleStrategy ; + int sampleRate ; + size_t sampleSize ; + FixedSizeElemArray sampledSA ; + + // precomputedRange: the BWT range for a prefix of size param.precomputeWidth + // The pair format is (the start position, and the length of the range). 
+ // The advantage is that we can easily tell whether a range is empty. + size_t precomputeWidth ; + size_t precomputeSize ; + std::pair *precomputedRange ; + + size_t maxLcp ; // only consider LCP up to this point + WORD *semiLcpGreater ; // The LCP is between current suffix and its previous one + WORD *semiLcpEqual ; + + size_t adjustedSA0 ; + std::map selectedSA ; // SAs for speical purposes: e.g. boundary of genomes + WORD *selectedSAFilter ; // Quick test whether a SA could be selectedSA + int selectedSAFilterSampleRate ; + + bool printLog ; + + _FMIndexAuxData() + { + sampleStrategy = 0 ; + sampleRate = 0 ; + sampleSize = 0 ; + precomputeWidth = 0 ; + precomputeSize = 0 ; + precomputedRange = NULL ; + + maxLcp = 0 ; + semiLcpGreater = NULL ; + semiLcpEqual = NULL ; + + adjustedSA0 = 0 ; + selectedSAFilter = NULL ; + selectedSAFilterSampleRate = 1024 ; + + printLog = true ; + } + + ~_FMIndexAuxData() + { + // NOTE: has to be explicitly called through Free to release the memory. + } ; + + void Free() + { + sampledSA.Free() ; + + if (precomputedRange) + { + free(precomputedRange) ; + precomputedRange = NULL ; + } + + if (semiLcpGreater) + { + free(semiLcpGreater) ; + free(semiLcpEqual) ; + semiLcpGreater = NULL ; + semiLcpEqual = NULL ; + } + + if (selectedSA.size() > 0) + { + selectedSA.clear() ; + free(selectedSAFilter) ; + } + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, n) ; + SAVE_VAR(fp, sampleStrategy) ; + SAVE_VAR(fp, sampleRate) ; + SAVE_VAR(fp, sampleSize) ; + SAVE_VAR(fp, precomputeWidth) ; + SAVE_VAR(fp, precomputeSize) ; + SAVE_VAR(fp, adjustedSA0) ; + + sampledSA.Save(fp) ; + SAVE_ARR(fp, precomputedRange, precomputeSize) ; + + SAVE_VAR(fp, maxLcp) ; + if (maxLcp > 0) + { + fwrite(semiLcpGreater, sizeof(*semiLcpGreater), Utils::BitsToWords(n), fp) ; + fwrite(semiLcpEqual, sizeof(*semiLcpEqual), Utils::BitsToWords(n), fp) ; + } + + // For speical SAs + size_t tmpSize = selectedSA.size() ; + SAVE_VAR(fp, tmpSize) ; + SAVE_VAR(fp, 
selectedSAFilterSampleRate) ; + for (std::map::iterator iter = selectedSA.begin() ; + iter != selectedSA.end() ; ++iter) + { + size_t pair[2] = {iter->first, iter->second} ; + fwrite(pair, sizeof(size_t), 2, fp) ; + } + } + + void Load(FILE *fp) + { + Free() ; + size_t i ; + + LOAD_VAR(fp, n) ; + LOAD_VAR(fp, sampleStrategy) ; + LOAD_VAR(fp, sampleRate) ; + LOAD_VAR(fp, sampleSize) ; + LOAD_VAR(fp, precomputeWidth) ; + LOAD_VAR(fp, precomputeSize) ; + LOAD_VAR(fp, adjustedSA0) ; + + sampledSA.Load(fp) ; + precomputedRange = (std::pair *)malloc( + sizeof(std::pair) * precomputeSize) ; + LOAD_ARR(fp, precomputedRange, precomputeSize) ; + + LOAD_VAR(fp, maxLcp) ; + if (maxLcp > 0) + { + semiLcpGreater = Utils::MallocByBits(n) ; + semiLcpEqual = Utils::MallocByBits(n) ; + fread(semiLcpGreater, sizeof(*semiLcpGreater), Utils::BitsToWords(n), fp) ; + fread(semiLcpEqual, sizeof(*semiLcpEqual), Utils::BitsToWords(n), fp) ; + } + + size_t tmpSize = 0 ; + LOAD_VAR(fp, tmpSize) ; + LOAD_VAR(fp, selectedSAFilterSampleRate) ; + if (tmpSize > 0) + { + selectedSAFilter = Utils::MallocByBits(DIV_CEIL(n, selectedSAFilterSampleRate)) ; + for (i = 0 ; i < tmpSize ; ++i) + { + size_t pair[2] ; + fread(pair, sizeof(size_t), 2, fp) ; + selectedSA[pair[0]] = pair[1] ; + Utils::BitSet(selectedSAFilter, pair[0] / selectedSAFilterSampleRate) ; + } + } + } +} ; + +template +class FMIndex +{ +private: + SeqClass _BWT ; + size_t _n ; + Alphabet _alphabets ; // May handle more complex mapping, e.g. 
Huffman coding + Alphabet _plainAlphabetCoder ; // for plain mapping, important for partial sum access + size_t *_plainAlphabetPartialSum ; + size_t _plainAlphabetBits ; // Needed for coding index accessing precomputedRange + size_t _firstISA ; // ISA[0] + ALPHABET _lastChr ; // last character in the original text + + // @return: whether SA[i] information is stored + // the SA information is returned through the reference sa + bool GetSampledSA(size_t i, size_t &sa) + { + if (i == _firstISA) + { + sa = _auxData.adjustedSA0 ; + return true ; + } + else if (i % _auxData.sampleRate == 0) + { + sa = _auxData.sampledSA[i / _auxData.sampleRate] ; + return true ; + } + else if (_auxData.selectedSAFilter) + { + if (Utils::BitRead(_auxData.selectedSAFilter, i / _auxData.selectedSAFilterSampleRate) + && (_auxData.selectedSA.find(i) != _auxData.selectedSA.end())) + { + sa = _auxData.selectedSA[i] ; + return true ; + } + } + + return false ; + } +public: + struct _FMIndexAuxData _auxData ; // the data used for locate operation + + FMIndex() + { + _n = 0 ; + } + + ~FMIndex() + { + Free() ; + } + + void SetAlphabetCode(const Alphabet &a) + { + _alphabets = a ; + } + + void Free() + { + if (_n > 0) + { + _n = 0 ; + free(_plainAlphabetPartialSum) ; + _auxData.Free() ; + } + } + + void InitAuxData(struct _FMBuilderParam &builderParam) + { + _auxData.n = builderParam.n ; + + _auxData.sampleRate = builderParam.sampleRate ; + _auxData.sampleSize = builderParam.sampleSize ; + _auxData.sampleStrategy = builderParam.sampleStrategy ; + //_auxData.sampledSA = builderParam.sampledSA ; + _auxData.sampledSA.InitFromArray(0, builderParam.sampledSA, _auxData.sampleSize) ; + free(builderParam.sampledSA) ; + + _auxData.precomputeWidth = builderParam.precomputeWidth ; + _auxData.precomputeSize = builderParam.precomputeSize ; + _auxData.precomputedRange = builderParam.precomputedRange ; + + _auxData.maxLcp = builderParam.maxLcp ; + _auxData.semiLcpGreater = builderParam.semiLcpGreater ; + 
_auxData.semiLcpEqual = builderParam.semiLcpEqual ; + + _auxData.adjustedSA0 = builderParam.adjustedSA0 ; + + if (builderParam.selectedSA.size() > 0) + { + _auxData.selectedSAFilter = Utils::MallocByBits(DIV_CEIL(_auxData.n, + _auxData.selectedSAFilterSampleRate)) ; + + _auxData.selectedSA = builderParam.selectedSA ; + for (std::map::iterator iter = _auxData.selectedSA.begin() ; + iter != _auxData.selectedSA.end(); ++iter) + { + Utils::BitSet(_auxData.selectedSAFilter, + iter->first / _auxData.selectedSAFilterSampleRate) ; + } + } + } + + void Init(FixedSizeElemArray &BWT, size_t n, + size_t firstISA, struct _FMBuilderParam& builderParam, + const ALPHABET *alphabetMapping, int alphabetSize) + { + size_t i ; + + _plainAlphabetCoder.InitFromList(alphabetMapping, alphabetSize) ; // The input BWT string should be also plain coded in the same fashion + _plainAlphabetBits = Utils::Log2Ceil(alphabetSize) ; + + if (_alphabets.GetSize() == 0) + _alphabets.InitFromList(alphabetMapping, alphabetSize) ; + + // Auxiliary data structures + _n = n ; + _firstISA = firstISA ; + _lastChr = alphabetMapping[ BWT.Read(firstISA) ] ; + InitAuxData(builderParam) ; + + // L list + _BWT.SetAlphabet(_alphabets) ; + _BWT.Init(BWT, n, alphabetMapping) ; + if (_auxData.printLog) + _BWT.PrintStats() ; + + // F list + _plainAlphabetPartialSum = (size_t *)calloc(alphabetSize + 1, + sizeof(*_plainAlphabetPartialSum)) ; + for (i = 0 ; i < n ; ++i) + { + ++_plainAlphabetPartialSum[BWT.Read(i)] ; + } + for (i = 1 ; i < alphabetSize ; ++i) + _plainAlphabetPartialSum[i] += _plainAlphabetPartialSum[i - 1] ; + for (i = alphabetSize ; i >= 1 ; --i) + _plainAlphabetPartialSum[i] = _plainAlphabetPartialSum[i - 1] ; + _plainAlphabetPartialSum[0] = 0 ; + } + + size_t Rank(ALPHABET c, size_t p, int inclusive = 1) + { + size_t ret = _BWT.Rank(c, p, inclusive) ; + // Since we do not use $, the last character in the original string + // will be moved to the _firstISA instead of the first position + // We need to 
move this back + // Potential future refactoring: appending an A to the end of the string + if (c == _lastChr && (p < _firstISA || (!inclusive && p == _firstISA))) + ++ret ; + return ret ; + } + + void BackwardExtend(ALPHABET c, size_t sp, size_t ep, + size_t &nextSp, size_t &nextEp) + { + size_t offset = _plainAlphabetPartialSum[ _plainAlphabetCoder.Encode(c) ] ; + //printf("%c: %d %d %d. %d %d\n", c, offset, sp, ep, _BWT.Rank(c, sp, 0), + // _BWT.Rank(c, ep)) ; + // Need minus 1 here because the return of Rank is 1-based. + nextSp = offset + Rank(c, sp, /*inclusive=*/0) + 1 - 1 ; + + // TODO: Fix a potential issue of underflow. + // Now it is handled by out side + if (sp != ep) + nextEp = offset + Rank(c, ep) - 1 ; + else + nextEp = nextSp + ((_BWT.Access(ep) == c) ? 0 : -1) ; + } + + // This one is essentially LF mapping + size_t BackwardExtend(ALPHABET c, size_t p) + { + size_t offset = _plainAlphabetPartialSum[ _plainAlphabetCoder.Encode(c) ] ; + return offset + Rank(c, p) - 1 ; + } + + // m - length of s + // Return the [sp, ep] through the option, and the length of matched prefix in size_t + size_t BackwardSearch(char *s, size_t m, size_t &sp, size_t &ep) + { + size_t i ; + if (m < _auxData.precomputeWidth) + return 0 ; + + if (_auxData.precomputeWidth > 0) + { + WORD initW = 0 ; + for (i = 0 ; i < _auxData.precomputeWidth ; ++i) + { + if (!_alphabets.IsIn(s[m - 1 - i])) + { + sp = 1 ; + ep = 0 ; + return i ; + } + initW = (initW << _plainAlphabetBits) | (_plainAlphabetCoder.Encode(s[m - 1 - i])) ; + } + + if (_auxData.precomputedRange[initW].second == 0) + { + sp = 1 ; + ep = 0 ; + return _auxData.precomputeWidth - 1 ; + } + sp = _auxData.precomputedRange[initW].first ; + ep = sp + _auxData.precomputedRange[initW].second - 1 ; + } + else + { + sp = 0 ; + ep = _n - 1 ; + } + + size_t l = _auxData.precomputeWidth ; + size_t nextSp = sp ; + size_t nextEp = ep ; + while (l < m) + { + if (!_alphabets.IsIn(s[m - 1 - l])) + break ; + BackwardExtend(s[m - 1 - l], 
sp, ep, nextSp, nextEp) ; + if ( nextSp > nextEp || nextEp > _n) + break ; + sp = nextSp ; + ep = nextEp ; + ++l ; + } + return l ; + } + + // @return: the value of the sampled SA for BWT[i] + // l is the offset between + size_t BackwardToSampledSA(size_t i, size_t &l) + { + l = 0 ; + size_t ret = 0 ; + while (!GetSampledSA(i, ret)) + { + i = BackwardExtend( _BWT.Access(i), i) ; + ++l ; + } + return ret ; + } + + // return ISA[n - 1] + size_t GetLastISA() + { + return _plainAlphabetPartialSum[ _plainAlphabetCoder.Encode(_lastChr) ] ; + } + + // Calculate the values for SA[sp..ep] + void LocateRange(size_t sp, size_t ep, bool withOffset, std::vector &locatedSA) + { + size_t i ; + locatedSA.clear() ; + for (i = sp ; i <= ep ; ++i) + { + size_t l ; + size_t sa = BackwardToSampledSA(i, l) ; + if (withOffset) + locatedSA.push_back(sa + l) ; + else + locatedSA.push_back(sa) ; + } + } + + size_t GetSize() + { + return _n ; + } + + size_t GetAlphabetSize() + { + return _alphabets.GetSize() ; + } + + void PrintSpace() + { + Utils::PrintLog("FM-index space usage (bytes):") ; + Utils::PrintLog("BWT: %llu", _BWT.GetSpace()) ; + Utils::PrintLog("sampledSA: %llu", _auxData.sampledSA.GetSpace()) ; + Utils::PrintLog("precomputedRange: %llu", _auxData.precomputeSize * sizeof(*_auxData.precomputedRange)) ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _plainAlphabetBits) ; + SAVE_VAR(fp, _firstISA) ; + SAVE_VAR(fp, _lastChr) ; + + _BWT.Save(fp) ; + + _alphabets.Save(fp) ; + _plainAlphabetCoder.Save(fp) ; + size_t alphabetSize = _plainAlphabetCoder.GetSize() ; + SAVE_ARR(fp, _plainAlphabetPartialSum, alphabetSize + 1) ; + + _auxData.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _plainAlphabetBits) ; + LOAD_VAR(fp, _firstISA) ; + LOAD_VAR(fp, _lastChr) ; + + _BWT.Load(fp) ; + + _alphabets.Load(fp) ; + _plainAlphabetCoder.Load(fp) ; + size_t alphabetSize = _plainAlphabetCoder.GetSize() ; + _plainAlphabetPartialSum = 
(size_t *)calloc(alphabetSize + 1, + sizeof(*_plainAlphabetPartialSum)) ; + LOAD_ARR(fp, _plainAlphabetPartialSum, alphabetSize + 1) ; + + _auxData.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/FixedSizeElemArray.hpp b/compactds/FixedSizeElemArray.hpp new file mode 100644 index 0000000..4591774 --- /dev/null +++ b/compactds/FixedSizeElemArray.hpp @@ -0,0 +1,322 @@ +#ifndef _MOURISL_COMPACTDS_FIXEDSIZEELEM_ARRAY +#define _MOURISL_COMPACTDS_FIXEDSIZEELEM_ARRAY + +#include +#include + +#include + +#include "Utils.hpp" + +/* + * The class for the array where each element is of fixed size + * We use a word size w = 64bit to maximize the chance of within word access + * Externally the index is continuous, but interally they are segmented by the word as the right of Fig 3.3 + */ + +namespace compactds { +class FixedSizeElemArray +{ +private: + WORD *_W ; + size_t _size ; // memory size in word + int _l ; + size_t _n ; +public: + FixedSizeElemArray() + { + _W = NULL ; + _size = 0 ; + _n = 0 ; + _l = 0 ; + } + + ~FixedSizeElemArray() + { + Free() ; + } + + // Allocate the memory for _n elements, where each elements takes _l bits + void Malloc(int l, size_t n) + { + Free() ; + this->_n = n ; + this->_l = l ; + _size = Utils::BitsToWords(l * n) ; + _W = Utils::MallocByBits(l * n) ; + } + + // _l - number of bits for each element. 
<=0: automatically decide + // in - input array + // n - the length of input array + void InitFromArray(int l, const unsigned int *in, const size_t &n) + { + size_t i ; + if (l <= 0) + { + // We determine the best fixed size + l = 1 ; + for (i = 0 ; i < n ; ++i) + { + int bitCounts = Utils::CountBits(in[i]) ; + if (bitCounts > l) + l = bitCounts ; + } + } + + Malloc(l, n) ; + for (i = 0 ; i < n ; ++i) + Write(i, in[i]) ; + } + + void InitFromArray(int l, const size_t *in, const size_t &n) + { + size_t i ; + if (l <= 0) + { + // We determine the best fixed size + l = 1 ; + for (i = 0 ; i < n ; ++i) + { + int bitCounts = Utils::CountBits(in[i]) ; + if (bitCounts > l) + l = bitCounts ; + } + } + + Malloc(l, n) ; + for (i = 0 ; i < n ; ++i) + Write(i, in[i]) ; + } + + void Free() + { + if (_W != NULL) + free(_W) ; + _W = NULL ; + _n = _l = 0 ; + } + + // Get the i-th element + uint64_t Read(size_t i) const + { + return Utils::BitsRead(_W, i * _l, (i + 1)* _l - 1) ; + } + + uint64_t operator[](size_t i) const + { + return Read(i) ; + } + + void Write(size_t i, int x) + { + Utils::BitsWrite(_W, i * _l, (i + 1) * _l - 1, x) ; + } + + /*uint64_t Read64(size_t i) const + { + return Utils::BitsRead(_W, i * _l, (i + 1)* _l - 1) ; + }*/ + + void Write64(size_t i, uint64_t x) + { + Utils::BitsWrite(_W, i * _l, (i + 1) * _l - 1, x) ; + } + + size_t GetSpace() const + { + return sizeof(_W[0]) * _size + sizeof(*this) ; + } + + int GetElemLength() const + { + return _l ; + } + + void SetElemLength(int l) + { + _l = l ; + } + + size_t GetSize() const + { + return _n ; + } + + // Assume we don't need to change the memory size + void SetSize(size_t n) + { + _n = n ; + } + + const WORD* GetData() const + { + return _W ; + } + + // Return num elements starting from i. 
+ // @return: bit packed _W[i].._W[i + num - 1] + WORD PackRead(size_t i, size_t num) const + { + return Utils::BitsRead(_W, i * _l, (i + num) * _l - 1) ; + } + + WORD PackReadRev(size_t i, size_t num) const + { + size_t j ; + WORD ret = 0 ; + for (j = 0 ; j < num ; ++j) + ret = (ret << _l) + Read(i + j) ; + return ret ; + } + + // Find the length of the matching prefix between A[s..e] and B[s..e] + // assumes _l is the same + // If all match, return min(e-s+1, eb-sb+1) + size_t PrefixMatchLen(size_t s, size_t e, const FixedSizeElemArray &B, size_t sb, size_t eb) const + { + if (e >= _n) + e = _n - 1 ; + if (eb >= B._n) + eb = B._n - 1 ; + size_t ai ; + size_t bi ; + + int block = WORDBITS / _l ; + ai = s ; + bi = sb ; + int len = MIN(e-s+1, eb-sb+1) ; + if (len < block) + block = len ; + if (block > 1) + { + for ( ; ai + block - 1 <= e && bi + block - 1 <= eb ; + ai += block, bi += block) + { + WORD wa = PackRead(ai, block) ; + WORD wb = B.PackRead(bi, block) ; + if (wa == wb) + continue ; + + int k ; + for (k = 0 ; k < block ; ++k) + { + WORD smalla = (wa >> (k * _l)) & MASK(_l) ; + WORD smallb = (wb >> (k * _l)) & MASK(_l) ; + if (smalla != smallb) + return ai + k - s ; + } + } + } + + for ( ; ai <= e && bi <= eb ; ++ai, ++bi) + { + WORD smalla = Read(ai) ; + WORD smallb = B.Read(bi) ; + if (smalla != smallb) + return ai - s ; + } + + return MIN(e - s + 1, eb - sb + 1) ; + } + + // Compare A[s..e] and B[sb..eb] + // @return: sign(A-B) + int SubrangeCompare(size_t s, size_t e, const FixedSizeElemArray &B, size_t sb, size_t eb) const + { + if (_l != B._l) + return _l - B._l ; + if (e >= _n) + e = _n - 1 ; + if (eb >= B._n) + eb = B._n - 1 ; + size_t matchCnt = PrefixMatchLen(s, e, B, sb, eb) ; + + if (matchCnt == MIN(e - s + 1, eb - sb + 1)) + { + if (e - s + 1 == eb - sb + 1) + return 0 ; + else if (e - s + 1 < eb - sb + 1) + return -1 ; + else + return 1 ; + } + else + { + WORD smalla = Read(s + matchCnt) ; + WORD smallb = B.Read(sb + matchCnt) ; + + if (smalla 
< smallb) + return -1 ; + else // they have to be different at this point + return 1 ; + } + } + + // Malloc by copying the first p element of B + void InitFromOtherPrefix(const FixedSizeElemArray &B, size_t p) + { + Malloc(B._l, p) ; + size_t wordBytes = Utils::BitsToWordBytes(_n * _l) ; + memcpy(_W, B._W, wordBytes) ; + } + + void Resize(size_t newn) + { + _n = newn ; + _size = Utils::BitsToWords(_l * newn) ; + _W = (WORD *)realloc(_W, _size * sizeof(WORD)) ; + } + + // Reserve the space for m elements without changing current element + void Reserve(size_t m) + { + if (m <= _n) + return; + + _size = Utils::BitsToWords(_l * m) ; + if (_W != NULL) + _W = (WORD *)realloc(_W, _size * sizeof(WORD)) ; + else + _W = Utils::MallocByBits(_l * m) ; + } + + // push back another element to the end of the array. + // This function also handles expand the array + void PushBack(int x) + { + if (Utils::BitsToWords(_l * _n) == _size) + Reserve(2 * _n) ; + Write(_n, x); + ++_n ; + } + + void Print(FILE *fp, char sep = ' ') const + { + size_t i ; + for (i = 0 ; i < _n ; ++i) + fprintf(fp, "%d%c", (int)Read(i), sep) ; + fprintf(fp, "\n") ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _size) ; + SAVE_VAR(fp, _l) ; + SAVE_VAR(fp, _n) ; + fwrite(_W, sizeof(_W[0]), Utils::BitsToWords(_n * _l), fp) ; + } + + void Load(FILE *fp) + { + Free() ; + LOAD_VAR(fp, _size) ; + LOAD_VAR(fp, _l) ; + LOAD_VAR(fp, _n) ; + _W = Utils::MallocByBits(WORDBITS * _size) ; + fread(_W, sizeof(_W[0]), Utils::BitsToWords(_n * _l), fp) ; + } +} ; +} + +#endif diff --git a/compactds/FractionBitElemArray.hpp b/compactds/FractionBitElemArray.hpp new file mode 100644 index 0000000..6a1d68f --- /dev/null +++ b/compactds/FractionBitElemArray.hpp @@ -0,0 +1,118 @@ +#ifndef _MOURISL_COMPACTDS_FRACTIONBITELEM_ARRAY +#define _MOURISL_COMPACTDS_FRACTIONBITELEM_ARRAY + +#include +#include + +#include + +#include "Utils.hpp" + +/* + * The class for the array where each element is in the range of [0..d-1] and _d is far 
from the power of 2 + * The idea is that each WORD is a d-ary number (Section 3.1) + */ + +namespace compactds { +class FractionBitElemArray +{ +private: + WORD *_W ; + const int _w ; + size_t _size ; + size_t _d ; // element is in the range of [0..d-1] + size_t _n ; + int _k ; // number of elements per word +public: + FractionBitElemArray():_w(8 * sizeof(WORD)) + { + _W = NULL ; + } + + ~FractionBitElemArray() + { + Free() ; + } + + // Allocate the memory for _n elements, where each element is in the range of [0..d-1] + void Malloc(size_t d, size_t n) + { + this->_n = n ; + this->_d = d ; + _k = (int)(_w / ((double)log((double)_d) / (double)log(2.0))) ; + _size = DIV_CEIL(n, _k) ; + _W = Utils::MallocByBits(_size * WORDBITS) ; + } + + // in - input array + // n - the length of input array + void InitFromArray(size_t d, const unsigned int *in, const size_t &n) + { + size_t i ; + if (d == 0) + { + // We determine the best fixed size + d = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (in[i] > d) + d = in[i] ; + } + ++d ; + } + + Malloc(d, n) ; + for (i = 0 ; i < _n ; ++i) + Write(i, in[i]) ; + } + + void Free() + { + if (_W != NULL) + free(_W) ; + _W = NULL ; + } + + // Get the i-th element + unsigned Read(size_t i) const + { + return (_W[i/_k] / Utils::PowerInt(_d, i%_k)) % _d ; + } + + void Write(size_t i, int x) + { + size_t j = i / _k ; + size_t p = Utils::PowerInt(_d, i%_k) ; + _W[j] = _W[j] - ((_W[j] / p) %_d) * p + x * p ; + } + + size_t GetSpace() const + { + return sizeof(_W[0]) * _size + sizeof(*this) ; + } + + int GetElemRange() const + { + return _d ; + } + + size_t GetSize() const + { + return _n ; + } + + const WORD* GetData() const + { + return _W ; + } + + void Resize(size_t newn) + { + _n = newn ; + _size = Utils::BitsToWords(DIV_CEIL(_n, _k)) ; + _W = (WORD *)realloc(_W, _size * sizeof(WORD)) ; + } +} ; +} + +#endif diff --git a/compactds/HuffmanCode.hpp b/compactds/HuffmanCode.hpp new file mode 100644 index 0000000..856d462 --- /dev/null +++ 
b/compactds/HuffmanCode.hpp @@ -0,0 +1,230 @@ +#ifndef _MOURISL_COMPACTDS_HUFFMANCODE +#define _MOURISL_COMPACTDS_HUFFMANCODE + +#include + +#include "Utils.hpp" + +namespace compactds { +struct _huffman_node +{ + int symbol ; + uint64_t freq ; + int next ; // used in _tree construction as a linked list + int left, right ; // Left, right children + bool operator <(const struct _huffman_node &b) const + { + return freq < b.freq ; + } +} ; + +class HuffmanCode +{ +private: + WORD *_codes ; // assume alphabet set is in [0, n-1]. + int *_codeLens ; + size_t _n ; // the size of the alphabet + struct _huffman_node *_tree ; + size_t _space ; + + // Algorithm 2.2: building a huffman _tree with linked list instead of heap + void BuildTree(struct _huffman_node *elems, size_t n) + { + std::sort(elems, elems + n) ; + + size_t i ; + size_t nodeCnt ; + + _tree = (struct _huffman_node *)malloc(sizeof(*_tree) * (2 * n - 1)) ; + _space += (sizeof(*_tree) * (2 * n - 1)) ; + + for (i = 0 ; i < n ; ++i) + { + _tree[i] = elems[i] ; + if (i + 1 < n) + _tree[i].next = i + 1 ; + else + _tree[i].next = -1 ; + _tree[i].left = _tree[i].right = -1 ; + } + size_t minTag = 0 ; // minTag and minTag+1 is the availble two nodes with minimum + size_t insertTag = 0 ; // the start position to search the next insert node + // this marker is the key for linear time building the _tree after sorting. 
+ nodeCnt = n ; + int p ; + while (1) + { + int a = minTag ; + int b = _tree[minTag].next ; + if (b == -1) + break ; + _tree[nodeCnt].symbol = -1 ; + _tree[nodeCnt].freq = _tree[a].freq + _tree[b].freq ; + _tree[nodeCnt].left = a ; + _tree[nodeCnt].right = b ; + + // Search for the appropriate position to insert the new element + p = insertTag ; + while (_tree[p].next != -1 && _tree[ _tree[p].next ].freq <= _tree[nodeCnt].freq) + p = _tree[p].next ; + + _tree[nodeCnt].next = _tree[p].next ; + _tree[p].next = nodeCnt ; + + insertTag = nodeCnt ; + ++nodeCnt ; + minTag = _tree[b].next ; + } + } + + // Recurisvely traverse the Huffman _tree to put the code + void CreateCodes(int tag, WORD c, int l) + { + if (_tree[tag].left == -1 && _tree[tag].right == -1) + { + _codes[_tree[tag].symbol] = c ; + _codeLens[_tree[tag].symbol] = l ; + return ; + } + + CreateCodes(_tree[tag].left, c<<1, l + 1) ; + CreateCodes(_tree[tag].right, (c<<1) + 1, l + 1) ; + } + + void InternalInit(struct _huffman_node *elems, size_t n) + { + this->_n = n ; + _space = 0 ; + + BuildTree(elems, n) ; + _codes = (WORD *)malloc(sizeof(*_codes) * n) ; + _codeLens = (int *)malloc(sizeof(*_codeLens) * n) ; + CreateCodes(2*n - 2, 0, 0) ; + } + +public: + HuffmanCode() + { + _n = _space = 0 ; + _codes = NULL ; + _codeLens = NULL ; + _tree = NULL ; + } + ~HuffmanCode() {Free();} + + void Free() + { + _n = _space = 0 ; + if (_codes != NULL) + { + free(_codes) ; free(_codeLens) ; free(_tree) ; + _codes = NULL ; + _codeLens = NULL ; + _tree = NULL ; + } + } + + size_t GetSpace() + { + return _space + sizeof(*this) ; + } + + int GetSize() + { + return _n ; + } + + struct _huffman_node *GetTree() const + { + return _tree ; + } + + size_t GetRoot() const + { + return 2 * _n - 2 ; + } + + HuffmanCode &operator =(const HuffmanCode &in) + { + Free() ; + + if (in._n == 0) + return *this; + _n = in._n ; + _space = in._space ; + + _codes = (WORD *)malloc(sizeof(*_codes) * _n) ; + _codeLens = (int 
*)malloc(sizeof(*_codeLens) * _n) ; + _tree = (struct _huffman_node *)malloc(sizeof(*_tree) * (2*_n-1)) ; + + memcpy(_codes, in._codes, sizeof(*_codes) * _n) ; + memcpy(_codeLens, in._codeLens, sizeof(*_codeLens) * _n) ; + memcpy(_tree, in._tree, sizeof(*_tree)) ; + + return *this ; + } + + void InitFromFrequency(const uint64_t *freq, const size_t n) + { + size_t i ; + struct _huffman_node *elems = (struct _huffman_node*)malloc(sizeof(*elems) * n); + + for (i = 0 ; i < n ; ++i) + { + elems[i].symbol = i ; + elems[i].freq = freq[i] ; + } + InternalInit(elems, n) ; + + free(elems) ; + } + + int GetDepth(int tag) + { + if (_tree[tag].left == -1) + return 0 ; + int ldepth = GetDepth(_tree[tag].left) ; + int rdepth = GetDepth(_tree[tag].right) ; + return 1 + (ldepth > rdepth ? ldepth : rdepth) ; + } + + WORD Encode(int x, int &l) const + { + l = _codeLens[x] ; + return _codes[x] ; + } + + int Decode(WORD c, int l) const + { + int i ; + int p = 2 * _n - 2 ; // root + for (i = 0 ; i < l ; ++i) + { + if ((c >> (l - i - 1)) & 1) + p = _tree[p].right ; + else + p = _tree[p].left ; + } + return _tree[p].symbol ; + } + + void Save(FILE *fp) + { + fwrite(this, sizeof(this), 1, fp) ; + fwrite(_tree, sizeof(_tree[0]), 2 * _n - 1, fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + fread(this, sizeof(this), 1, fp) ; + _codes = (WORD *)malloc(sizeof(*_codes) * _n) ; + _codeLens = (int *)malloc(sizeof(*_codeLens) * _n) ; + _tree = (struct _huffman_node *)malloc(sizeof(*_tree) * (2*_n-1)) ; + fwrite(_tree, sizeof(_tree[0]), 2 * _n - 1, fp) ; + CreateCodes(2*_n - 2, 0, 0) ; + } +} ; +} +#endif diff --git a/compactds/InterleavedFixedSizeElemArray.hpp b/compactds/InterleavedFixedSizeElemArray.hpp new file mode 100644 index 0000000..2504def --- /dev/null +++ b/compactds/InterleavedFixedSizeElemArray.hpp @@ -0,0 +1,238 @@ +#ifndef _MOURISL_COMPACTDS_INTERLEAVEDFIXEDSIZEELEM_ARRAY +#define _MOURISL_COMPACTDS_INTERLEAVEDFIXEDSIZEELEM_ARRAY + +// The class handles two levels of arrays. 
+// Also a class where the first level is 64bit. + +#include "Utils.hpp" + +namespace compactds { +class InterleavedFixedSizeElemArray +{ +private: + size_t _l0, _l1 ; // length of element 0 and 1 + size_t _n0 ; + size_t _f1 ; // frequency of element 1 after each element 0 + size_t _size ; //memory size, in words + WORD *_W ; +public: + InterleavedFixedSizeElemArray() + { + _W = NULL ; + _size = 0 ; + _n0 = _f1 = 0 ; + _l0 = _l1 = 0 ; + } + + ~InterleavedFixedSizeElemArray() + { + Free() ; + } + + void Free() + { + if (_n0 > 0) + { + free(_W) ; + _W = NULL ; + _size = 0 ; + _n0 = _f1 = 0 ; + _l0 = _l1 = 0 ; + } + } + + size_t GetSpace() const + { + return sizeof(_W[0]) * _size + sizeof(*this) ; + } + + void Malloc(size_t l0, size_t n0, int l1, size_t f1) + { + Free() ; + + _l0 = l0 ; + _n0 = n0 ; + _l1 = l1 ; + _f1 = f1 ; + _size = Utils::BitsToWords(l0 * n0 + l1 * n0 * f1) ; + _W = (WORD *)malloc(_size * sizeof(WORD)) ; + } + + void Resize(size_t newn1) + { + _n0 = newn1 ; + _size = Utils::BitsToWords(_l0 * _n0 + _l1 * _n0 * _f1) ; + _W = (WORD *)realloc(_W, _size * sizeof(WORD)) ; + } + + int GetElem0Length() const + { + return _l0 ; + } + + int GetElem1Length() const + { + return _l1 ; + } + + size_t GetSize0() const + { + return _n0 ; + } + + size_t GetSize1() const + { + return _n0 * _f1 ; + } + + void SetSize(size_t n0) + { + _n0 = n0 ; + } + + WORD Read(int type, size_t i) const + { + if (type == 0) + { + const size_t offset = i * (_l0 + _f1 * _l1) ; + return Utils::BitsRead(_W, offset, offset + _l0 - 1) ; + } + else + { + const size_t offset = (i / _f1) * (_l0 + _f1 * _l1) + _l0 + _l1 * (i%_f1); + return Utils::BitsRead(_W, offset, offset + _l1 - 1) ; + } + } + + void Write(size_t type, size_t i, int x) + { + if (type == 0) + { + const size_t offset = i * (_l0 + _f1 * _l1) ; + Utils::BitsWrite(_W, offset, offset + _l0 - 1, x) ; + } + else + { + const size_t offset = (i / _f1) * (_l0 + _f1 * _l1) + _l0 + _l1 * (i%_f1); + Utils::BitsWrite(_W, offset, offset + 
_l1 - 1, x) ; + } + } +} ; + +// Optimized for level 0 is 64bit integer. +// The second level will be paded +class Interleaved64FixedSizeElemArray +{ +private: + size_t _l1 ; // length of element 0 and 1 + size_t _n0 ; + size_t _f1 ; // frequency of element 1 after each element 0 + size_t _size ; //memory size + WORD *_W ; + size_t _b ; // block size for each element 0 and attached element 1, in words +public: + Interleaved64FixedSizeElemArray() + { + _W = NULL ; + _size = 0 ; + _n0 = _f1 = 0 ; + _l1 = 0 ; + } + + ~Interleaved64FixedSizeElemArray() + { + Free() ; + } + + void Free() + { + if (_n0 > 0) + { + free(_W) ; + _W = NULL ; + _size = 0 ; + _n0 = _f1 = 0 ; + _l1 = 0 ; + } + } + + size_t GetSpace() const + { + return sizeof(_W[0]) * _size + sizeof(*this) ; + } + + void Malloc(size_t n0, int l1, size_t f1) + { + Free() ; + + _n0 = n0 ; + _l1 = l1 ; + _f1 = f1 ; + _b = Utils::BitsToWords(WORDBITS + DIV_CEIL(l1 * f1, WORDBITS) * WORDBITS) ; + _size = Utils::BitsToWords(_n0 * _b * WORDBITS) ; + _W = (WORD *)malloc(_size * sizeof(WORD)) ; + } + + void Resize(size_t newn1) + { + _n0 = newn1 ; + _size = Utils::BitsToWords(_n0 * _b * WORDBITS) ; + _W = (WORD *)realloc(_W, _size * sizeof(WORD)) ; + } + + int GetElemr0Length() const + { + return 64 ; + } + + int GetElem2Length() const + { + return _l1 ; + } + + size_t GetSize1() const + { + return _n0 ; + } + + size_t GetSize2() const + { + return _n0 * _f1 ; + } + + void SetSize(size_t n0) + { + _n0 = n0 ; + } + + WORD Read0(size_t i) const + { + return _W[i * _b] ; + } + + WORD Read1(size_t i) const + { + const size_t tmp = i / _f1 ; + const size_t offset = (tmp * _b + 1)* WORDBITS + (i - tmp * _f1) * _l1 ; + return Utils::BitsRead(_W, offset, offset + _l1 - 1 ) ; + } + + void Write0(size_t i, WORD x) + { + _W[i * _b] = x ; + } + + void Write1(size_t i, int x) + { + const size_t tmp = i / _f1 ; + const size_t offset = (tmp * _b + 1)* WORDBITS + (i - tmp * _f1) * _l1 ; + Utils::BitsWrite(_W, offset, offset + _l1 - 1, 
x ) ; + } +} ; + + +typedef InterleavedFixedSizeElemArray ILArray ; +typedef Interleaved64FixedSizeElemArray IL64Array ; +} + +#endif diff --git a/compactds/InvertedIndex.hpp b/compactds/InvertedIndex.hpp new file mode 100644 index 0000000..35ac2e0 --- /dev/null +++ b/compactds/InvertedIndex.hpp @@ -0,0 +1,131 @@ +#ifndef _MOURISL_COMPACTDS_INVERTEDINDEX +#define _MOURISL_COMPACTDS_INVERTEDINDEX + +// Use permutation to represent inverted index + +#include "Utils.hpp" +#include "FixedSizeElemArray.hpp" +#include "Permutation.hpp" +#include "Bitvector_Plain.hpp" +#include "CompactMapper.hpp" + +namespace compactds { +class InvertedIndex +{ +private: + size_t _n ; + Permutation _pi ; + Bitvector_Plain _D ; // marker of the start position for each number/alphabet in the concatendated permutation list. + CompactMapper _map ; + size_t _space ; + +public: + InvertedIndex() + { + } + + ~InvertedIndex() + { + } + + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + void Init(const FixedSizeElemArray &list, size_t n, bool sparseMap) + { + size_t i ; + _n = n ; + + _map.Init(list, n, sparseMap) ; + + size_t *pi = (size_t *)malloc(sizeof(*pi) * _n) ; + size_t *psum = (size_t *)calloc(sizeof(*psum), _n) ; + for (i = 0 ; i < _n ; ++i) + { + ++psum[ _map.Map(list.Read(i)) ] ; + } + + size_t m = _map.GetCompactSize() ; + + _D.Malloc(_n) ; + _D.BitSet(0) ; + for (i = 1 ; i < m ; ++i) + { + psum[i] += psum[i - 1] ; + _D.BitSet(psum[i - 1]) ; + } + for (i = m - 1 ; i > 0 ; --i) + psum[i] = psum[i - 1] ; + psum[0] = 0 ; + _D.Init() ; + + for (i = 0 ; i < _n ; ++i) + { + size_t tmp = _map.Map(list.Read(i)) ; + pi[ psum[tmp] ] = i ; + ++psum[tmp] ; + } + _pi.Init(pi, n) ; + + free(pi) ; + free(psum) ; + } + + // Search the ith occurence label l (0-based) + size_t Search(size_t l, size_t i) const + { + size_t mapl = _map.Map(l) ; + return _pi.Next( _D.Select(mapl + 1) + i) ; + } + + // @return: the number of positions for label l + 
size_t Positions(size_t l, std::vector &pos) const + { + size_t mapl = _map.Map(l) ; + size_t i, cnt ; + if (mapl == _map.GetCompactSize() - 1) + cnt = _n - _D.Select(mapl + 1) ; + else + cnt = _D.Select(mapl + 2) - _D.Select(mapl) ; + + size_t start = _D.Select(mapl + 1) ; + for (i = 0 ; i < cnt ; ++i) + pos.push_back( _pi.Next(start + i) ) ; + + return cnt ; + } + + // Count the number of label l in the sequences + size_t Count(size_t l) const + { + size_t mapl = _map.Map(l) ; + if (mapl == _map.GetCompactSize() - 1) + return _n - _D.Select(mapl + 1) ; + else + return _D.Select(mapl + 2) - _D.Select(mapl) ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _space) ; + _pi.Save(fp) ; + _D.Save(fp) ; + _map.Save(fp) ; + } + + void Load(FILE *fp) + { + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _space) ; + _pi.Load(fp) ; + _D.Load(fp) ; + _map.Load(fp) ; + } +} ; + +} + +#endif diff --git a/compactds/Makefile b/compactds/Makefile new file mode 100644 index 0000000..65ad2ea --- /dev/null +++ b/compactds/Makefile @@ -0,0 +1,31 @@ +CXX = g++ +CXXFLAGS= -Wall -g -msse4.2 -O3 #-pg -g #-Wall #-O3 +LINKPATH= +LINKFLAGS = -lpthread -lz +DEBUG= +OBJECTS = #BaseReads.o Alignment.o +HEADERS = *.hpp + +#asan=1 +ifneq ($(asan),) + CXXFLAGS+=-fsanitize=address -g + LDFLAGS+=-fsanitize=address -ldl -g +endif + +#all: bitvector-benchmark #test #bitvector-benchmark +all: test #rbbwt #bitvector-benchmark + +test: test.o $(OBJECTS) + $(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $< $(OBJECTS) $(LINKFLAGS) + +bitvector-benchmark: bitvector_benchmark.cpp $(HEADERS) + $(CXX) -o $@ $(LINKPATH) $< $(LINKFLAGS) -std=c++11 -O3 -ffast-math -funroll-loops -msse4.2 -march=native -DHAVE_CXA_DEMANGLE + #$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $< $(OBJECTS) -std=c++11 $(LINKFLAGS) + +rbbwt: rbbwt.cpp $(HEADERS) + $(CXX) -o $@ $(LINKPATH) $< $(LINKFLAGS) -g -std=c++11 -Ofast -march=native -fstrict-aliasing + +test.o: test.cpp $(HEADERS) + +clean: + rm -f *.o *.gch test diff --git 
a/compactds/PartialSum.hpp b/compactds/PartialSum.hpp new file mode 100644 index 0000000..124a851 --- /dev/null +++ b/compactds/PartialSum.hpp @@ -0,0 +1,140 @@ +#ifndef _MOURISL_COMPACTDS_PARTIALSUM +#define _MOURISL_COMPACTDS_PARTIALSUM + +#include "Utils.hpp" +#include "Bitvector_Sparse.hpp" + +namespace compactds { +class PartialSum +{ +private: + Bitvector_Sparse _B ; // underlying sparse bit vector + size_t _n ; + uint64_t _totalSum ; +public: + PartialSum() + { + _n = _totalSum = 0 ; + } + + ~PartialSum() + { + Free() ; + } + + int GetSpace() + { + return _B.GetSpace() + sizeof(*this) ; + } + + void Free() + { + _B.Free() ; + } + + void SetSupportSearch(bool supportSearch) + { + _B.SetSupportRank(supportSearch) ; + } + + void SetSpeed(int speed) + { + _B.SetSpeed(speed) ; + } + + void Init(const int *array, const size_t n) + { + size_t i ; + uint64_t *psum ; + psum = (uint64_t *)malloc(sizeof(*psum) * (n+1)) ; // We store an extra element for all the length in sum + + psum[0] = 0 ; + for (i = 1 ; i < n + 1 ; ++i) + psum[i] = psum[i - 1] + array[i - 1] ; + InitFromPartialSum(psum, n) ; + free(psum) ; + } + + void Init(const size_t *array, const size_t n) + { + size_t i ; + uint64_t *psum ; + psum = (uint64_t *)malloc(sizeof(*psum) * (n+1)) ; // We store an extra element for all the length in sum + + psum[0] = 0 ; + for (i = 1 ; i < n + 1 ; ++i) + psum[i] = psum[i - 1] + array[i - 1] ; + InitFromPartialSum(psum, n) ; + free(psum) ; + } + + // n is the number of elements + // psum records the partial sum before the i-th element + // the last element should be the total sum, so need to store psum[n] + void InitFromPartialSum(const uint64_t *psum, const size_t n) + { + this->_n = n ; + this->_totalSum = psum[n] ; + _B.InitFromOnes(psum, n + 1, _totalSum) ; + } + + // Initalize where the numbers are marked on bit vector + // i.e., the partial sum is the index on the bit array + // It assumes the lowest bit of W[0] is 1, and the last + // index corresponds to the 
total sum + void InitFromBitvector(WORD *W, const size_t wsize) + { + _B.Init(W, wsize) ; + _n = _B.GetOneCnt() - 1 ; + _totalSum = _B.GetLastOneIdx() ; + } + + // Get the partial sum for index i + // sum_0^[i-1] A[j] + // Another interpretation is the summation for the first i elements. + uint64_t Sum(size_t i) const + { + if (i == 0) + return 0 ; + else if (i >= _n) + return _totalSum ; + else + // The input to Select is 1-based + return _B.Select(i + 1) ; + } + + // Return the max i that Sum(i) <= the value of v + size_t Search(const uint64_t v) const + { + if (v >= _totalSum) + return _n ; + return _B.Rank(1, (size_t)v) - 1 ; + } + + // Read the value of an element + int AccessValue(size_t i) const + { + if (i >= _n) + return -1 ; + return (int)(Sum(i + 1) - Sum(i)) ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _n); + SAVE_VAR(fp, _totalSum); + _B.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + LOAD_VAR(fp, _n); + LOAD_VAR(fp, _totalSum); + _B.Load(fp) ; + } +} ; +} + +#endif diff --git a/compactds/PerfectHash.hpp b/compactds/PerfectHash.hpp new file mode 100644 index 0000000..5322fbf --- /dev/null +++ b/compactds/PerfectHash.hpp @@ -0,0 +1,199 @@ +#ifndef _MOURISL_COMPACTDS_PERFECTHASH +#define _MOURISL_COMPACTDS_PERFECTHASH + +// Generate a perfect hash function given the set of keys +#include "UniversalHashGenerator.hpp" +#include "FractionBitElemArray.hpp" +#include "Bitvector_Plain.hpp" +#include "SimpleVector.hpp" + +#define PERFECT_MAP_KEY_TRIES 3 + +namespace compactds { +class PerfectHash +{ +private: + UniversalHashGenerator uh ; + uint64_t a[PERFECT_MAP_KEY_TRIES], b[PERFECT_MAP_KEY_TRIES] ; // the parameters from the universal hash function + FractionBitElemArray G ; + size_t m ; + + // Map with hash, this include the shift + uint64_t MapWithHashI(uint64_t key, int i) + { + return uh.Map(a[i], b[i], key) + i * (m/PERFECT_MAP_KEY_TRIES) ; + } + + // The method is to give each key three potential slots, + // the goal is to find a map that 
each slot is assigned by a unique key (one of the three). + // So we process all the keys first, and start from the slot with unique key already + // and release the assignment from the other two slots of the key. + // This may release more slots with unique keys, and we repeat this process + // If there are still ambiguous keys, we return FAIL(0) + // + // I think my implementation is better than the one suggested in the textbook, + // as it does not need to store the tuple and nodes/link map and also use queue instead + // of priority_queue which also gives linear time speed + int InitTry(uint64_t *keys, size_t n, SimpleVector *L, size_t *nL, + size_t *uniqueSlotQueue, WORD *keyIdxProcessed, size_t *S) + { + size_t i ; + int j ; // in this function, j is to iterate hash function tries + size_t Scnt = 0 ; + size_t uniqueSlotQueueS, uniqueSlotQueueE ; + + // Initialize some parametrs. + for (i = 0 ; i < m ; ++i) + L[i].Clear() ; + memset(keyIdxProcessed, 0, Utils::BitsToWordBytes(m)) ; + uniqueSlotQueueS = 0 ; uniqueSlotQueueE = 0 ; //[S..E) + + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + uh.Generate(a[j], b[j]) ; + + // Put all the keys to their slots + for (i = 0 ; i < n ; ++i) + { + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + { + uint64_t target = MapWithHashI(keys[i], j); + //printf("%llu %llu %d: %d %d: %d\n", a[j], b[j], m, i, j, target) ; + L[target].PushBack(i) ; + } + } + + // Initialize the unique slot queue + for (i = 0 ; i < m ; ++i) + { + if (L[i].Size() == 1) + { + uniqueSlotQueue[uniqueSlotQueueE] = i ; + ++uniqueSlotQueueE ; + } + nL[i] = L[i].Size() ; + } + + // main part, identify which slot is unique for a key until now + while (uniqueSlotQueueS < uniqueSlotQueueE) + { + size_t slot = uniqueSlotQueue[uniqueSlotQueueS] ; + ++uniqueSlotQueueS ; + // Since each slot will be removed once + // and the total length of the list PERFECT_MAP_KEY_TRIES*n + // , the overall time is still O(n) + size_t size = L[slot].Size() ; + size_t keyIdx = -1; + 
for (i = 0 ; i < size ; ++i) + { + if (!Utils::BitRead(keyIdxProcessed, L[slot][i])) + { + keyIdx = L[slot][i] ; + break ; + } + } + if (i >= size) + { + // The l becomes empty, this could happen when a key + // creates more than one unique-mapped slots + continue ; + } + Utils::BitSet(keyIdxProcessed, keyIdx) ; + S[Scnt] = keys[keyIdx] ; + ++Scnt ; + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + { + uint64_t target = MapWithHashI(keys[keyIdx], j) ; + --nL[target] ; + if (nL[target] == 1) // it could be 0, so we should not use <=1 + { + uniqueSlotQueue[uniqueSlotQueueE] = target ; + ++uniqueSlotQueueE ; + } + } + } + if (Scnt < n) + return 0 ; + G.Malloc(3, m) ; // The value of G is {0, 1, 2} + WORD *V = Utils::MallocByBits(m) ; + for (i = 1 ; i <= Scnt ; ++i) + { + size_t key = keys[S[Scnt - i]] ; + uint64_t targets[PERFECT_MAP_KEY_TRIES] ; + int gSumMod = 0 ; + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + { + targets[j] = MapWithHashI(key, j) ; + gSumMod += G.Read(targets[j]) % PERFECT_MAP_KEY_TRIES ; + } + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + { + if (!Utils::BitRead(V, targets[j])) + { + int tmp = (j - gSumMod) % PERFECT_MAP_KEY_TRIES ; + if (tmp < 0) + tmp += PERFECT_MAP_KEY_TRIES ; + G.Write(targets[j], tmp) ; + break ; + } + } + + for (j = 0 ; j < PERFECT_MAP_KEY_TRIES ; ++j) + Utils::BitSet(V, targets[j]) ; + } + free(V) ; + return 1 ; + } +public: + PerfectHash() {} + ~PerfectHash() {} + + size_t GetSpace() + { + return G.GetSpace() - sizeof(G) + sizeof(*this) ; + } + + void Init(uint64_t *keys, size_t n, size_t m) + { + if (m == 0) + m = CEIL(1.25 * n / PERFECT_MAP_KEY_TRIES) * PERFECT_MAP_KEY_TRIES; + this->m = m ; + SimpleVector *L ; // the key list associated with each slot + size_t *nL ; // number of element in each L + size_t *uniqueSlotQueue ; // the queue for slot with unique keys + size_t *S ; // the stack used to store keys + WORD *keyIdxProcessed ; // bit vector represent whether a key has been processed + + L = new 
SimpleVector[m] ; + nL = (size_t *)malloc(sizeof(size_t) * m) ; + uniqueSlotQueue = (size_t *)malloc(sizeof(size_t) * m) ; + S = (size_t *)malloc(sizeof(size_t) * n) ; + keyIdxProcessed = Utils::MallocByBits(n) ; + + uh.Init(m/PERFECT_MAP_KEY_TRIES, 0) ; + + while (!InitTry(keys, n, L, nL, uniqueSlotQueue, keyIdxProcessed, S)) + ; + + delete[] L ; + free(nL) ; + free(uniqueSlotQueue) ; + free(S) ; + free(keyIdxProcessed) ; + } + + uint64_t Map(uint64_t x) + { + size_t i ; + uint64_t hs[PERFECT_MAP_KEY_TRIES] ; + int gsum = 0 ; + for (i = 0 ; i < PERFECT_MAP_KEY_TRIES ; ++i) + { + hs[i] = MapWithHashI(x, i) ; + gsum += G.Read(hs[i]) ; + } + return hs[gsum %PERFECT_MAP_KEY_TRIES] ; + } +} ; +} + +#endif diff --git a/compactds/Permutation.hpp b/compactds/Permutation.hpp new file mode 100644 index 0000000..a1fac23 --- /dev/null +++ b/compactds/Permutation.hpp @@ -0,0 +1,237 @@ +#ifndef _MOURISL_COMPACTDS_PERMUTATION +#define _MOURISL_COMPACTDS_PERMUTATION + +// Compressed permutation representation. Chapter 5.3 +// So far it assuems at most 2^31 runs +#include "Utils.hpp" +#include "HuffmanCode.hpp" +#include "Bitvector_Plain.hpp" + +namespace compactds { +class Permutation +{ +private: + size_t _space ; + size_t _n ; + size_t _rcnt ; + Bitvector_Plain *_nodeB ; // The left, right child indicator + Bitvector_Plain _G ; // Mark the start position for each run in the permuation representation. 
+ int *_nodePath ; //Buffer to holde the node ids along a path from root to leaf + HuffmanCode _huffmanTree ; + + // Combine the CreateLeaves and CreateBitvectors of the book into the same function + // Using S to hold the sequences mimicing merge sort + // tag: tree id + void CreateBitvectors(const struct _huffman_node *tree, int tag, size_t *S, size_t offset, size_t *Pi) + { + size_t i ; + if (tree[tag].left == -1) + { + // Leaf + for (i = 0 ; i < tree[tag].freq ; ++i) + S[offset + i] = Pi[ _G.Select(1, tree[tag].symbol + 1) + i] ; + } + else + { + CreateBitvectors(tree, tree[tag].left, S, offset, Pi) ; + CreateBitvectors(tree, tree[tag].right, S, offset + tree[tree[tag].left].freq, Pi) ; + + _nodeB[tag].Malloc(tree[tag].freq) ; + _space += (_nodeB[tag].GetSpace() - sizeof(_nodeB[tag])) ; + + // Merge + size_t *buffer = (size_t *)malloc(sizeof(size_t) * tree[tag].freq) ; + size_t lp = offset, rp = offset + tree[tree[tag].left].freq ; // left/right pointer + size_t lcnt, rcnt ; // scanned count of left and right children + lcnt = rcnt = 0 ; + while (lcnt < tree[tree[tag].left].freq && rcnt < tree[ tree[tag].right ].freq) + { + if (S[lp] < S[rp]) + { + buffer[lcnt + rcnt] = S[lp] ; + ++lcnt ; ++lp ; + } + else if (S[lp] > S[rp]) + { + buffer[lcnt + rcnt] = S[rp] ; + _nodeB[tag].BitSet(lcnt + rcnt) ; + ++rcnt ; ++rp ; + } + else + { + // ERROR! 
+ } + } + while (lcnt < tree[tree[tag].left].freq) + { + buffer[lcnt + rcnt] = S[lp] ; + ++lcnt ; ++lp ; + } + while (rcnt < tree[tree[tag].right].freq) + { + buffer[lcnt + rcnt] = S[rp] ; + _nodeB[tag].BitSet(lcnt + rcnt) ; + ++rcnt ; ++rp ; + } + _nodeB[tag].Init() ; + for (i = offset ; i < offset + tree[tag].freq ; ++i) + S[i] = buffer[i - offset] ; + free(buffer) ; + } + } +public: + Permutation() + { + _space = 0 ; + _n = 0 ; + _rcnt = 0 ; + } + + ~Permutation() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + delete[] _nodeB ; + free(_nodePath) ; + _n = 0 ; + } + } + + size_t GetSpace() + { + return _space + sizeof(*this) ; + } + + void Init(size_t *Pi, size_t n) + { + _n = n ; + size_t i, j ; + std::vector rstarts ; + std::vector rlens ; + _G.Malloc(n) ; + for (i = 0 ; i < n ;) + { + for (j = i + 1 ; j < n ; ++j) + if (Pi[j] < Pi[j - 1]) + break ; + rstarts.push_back(i) ; + rlens.push_back(j - i) ; + _G.BitSet(i) ; + i = j ; + } + _rcnt = rstarts.size() ; + _G.Init() ; + _space += _G.GetSpace() - sizeof(_G) ; + + _huffmanTree.InitFromFrequency(rlens.data(), _rcnt) ; + _space += _huffmanTree.GetSpace() - sizeof(_huffmanTree) ; + + int depth = _huffmanTree.GetDepth( _huffmanTree.GetRoot() ) ; + _nodePath = (int *)malloc(sizeof(_nodePath[0]) * (depth + 1)) ; + _space += sizeof(_nodePath[0]) * (depth + 1) ; + + _nodeB = new Bitvector_Plain[2 * _rcnt - 1] ; + _space += sizeof(Bitvector_Plain) * (2 * _rcnt - 1) ; + size_t *S = (size_t *)malloc(sizeof(size_t) * n) ; + CreateBitvectors(_huffmanTree.GetTree(), _huffmanTree.GetRoot(), S, 0, Pi) ; + /*for (i = 0 ; i < _rcnt ; ++i) + { + int l ; + WORD code = _huffmanTree.Encode(i, l) ; + printf("%d %d %d: %d %d\n", i, rstarts[i], rlens[i], code, l) ; + }*/ + free(S) ; + } + + // Pi(i) + // read() in the book + size_t Next(size_t i) const + { + int j ; + const struct _huffman_node *tree = _huffmanTree.GetTree() ; + int len ; + size_t ri = _G.Rank(1, i) - 1 ; + WORD code = _huffmanTree.Encode(ri, len) ; + + 
_nodePath[0] = _huffmanTree.GetRoot() ; + for (j = 0 ; j < len ; ++j) + { + if ((code >> (len - j - 1)) & 1) + _nodePath[j + 1] = tree[ _nodePath[j] ].right ; + else + _nodePath[j + 1] = tree[ _nodePath[j] ].left ; + } + + i = i - _G.Select(1, ri + 1) ; + for (j = len - 1 ; j >= 0 ; --j) + { + if ((code >> (len - 1 - j)) & 1) + { + i = _nodeB[_nodePath[j]].Select(1, i + 1) ; + } + else + { + i = _nodeB[_nodePath[j]].Select(0, i + 1) ; + } + } + return i ; + } + + // Pi^-1(i) + // inverse() in the book + size_t Prev(size_t i) const + { + size_t j = i ; // tracking the position of i in a run + const struct _huffman_node *tree = _huffmanTree.GetTree() ; + size_t tag = _huffmanTree.GetRoot() ; + while (tree[tag].left != -1) + { + int b = _nodeB[tag].Access(j) ; + j = _nodeB[tag].Rank(b, j, /*inclusive=*/0) ; + if (b == 0) + tag = tree[tag].left ; + else + tag = tree[tag].right ; + } + + return _G.Select(1, tree[tag].symbol + 1) + j ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, _n) ; + SAVE_VAR(fp, _space) ; + SAVE_VAR(fp, _rcnt) ; + _G.Save(fp) ; + _huffmanTree.Save(fp) ; + size_t i ; + for (i = 0 ; i < 2 * _rcnt - 1 ; ++i) + _nodeB[i].Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + LOAD_VAR(fp, _n) ; + LOAD_VAR(fp, _space) ; + LOAD_VAR(fp, _rcnt) ; + _G.Load(fp) ; + _huffmanTree.Load(fp) ; + size_t i ; + _nodeB = new Bitvector_Plain[2 * _rcnt - 1] ; + for (i = 0 ; i < 2 * _rcnt - 1 ; ++i) + _nodeB[i].Load(fp) ; + + int depth = _huffmanTree.GetDepth( _huffmanTree.GetRoot() ) ; + _nodePath = (int *)malloc(sizeof(_nodePath[0]) * (depth + 1)) ; + } +} ; +} + +#endif diff --git a/compactds/Sequence.hpp b/compactds/Sequence.hpp new file mode 100644 index 0000000..359efe9 --- /dev/null +++ b/compactds/Sequence.hpp @@ -0,0 +1,48 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE +#define _MOURISL_COMPACTDS_SEQUENCE + +#include "Utils.hpp" +#include "Alphabet.hpp" +#include "FixedSizeElemArray.hpp" + +namespace compactds { +class Sequence +{ +protected: + size_t _space ; + 
Alphabet _alphabets ; + size_t _n ; // sequence length +public: + Sequence() {_space = 0 ; _n = 0 ;} + ~Sequence() {} + + void SetAlphabet(const Alphabet &a) + { + _alphabets = a ; + } + + virtual void Save(FILE *fp) + { + SAVE_VAR(fp, _space) ; + SAVE_VAR(fp, _n) ; + _alphabets.Save(fp) ; + } + + virtual void Load(FILE *fp) + { + LOAD_VAR(fp, _space) ; + LOAD_VAR(fp, _n) ; + _alphabets.Load(fp) ; + } + + virtual void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) = 0 ; + virtual void Free() = 0 ; + virtual size_t GetSpace() = 0 ; + virtual ALPHABET Access(size_t i) const = 0 ; + virtual size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const = 0 ; + virtual size_t Select(ALPHABET c, size_t i) const = 0 ; + virtual void PrintStats() = 0 ; +} ; +} + +#endif diff --git a/compactds/SequenceCompactor.hpp b/compactds/SequenceCompactor.hpp new file mode 100644 index 0000000..f7b0f70 --- /dev/null +++ b/compactds/SequenceCompactor.hpp @@ -0,0 +1,76 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCECOMPACTOR +#define _MOURISL_COMPACTDS_SEQUENCECOMPACTOR + +// The class that handles convert the raw sequence to FixedSizeElemArray +// I put this class in compactds because FM and Sequence classes assumes +// the input is from compact representation +#include "FixedSizeElemArray.hpp" +#include "Alphabet.hpp" + +namespace compactds { +class SequenceCompactor +{ +private: + bool _capitalize ; + ALPHABET _missingReplace ; + Alphabet _alphabets ; +public: + SequenceCompactor() + { + _capitalize = false ; + _missingReplace = '\0' ; + }; + + ~SequenceCompactor() {} ; + + void Init(const char *alphabetList) + { + _alphabets.InitFromList(alphabetList, strlen(alphabetList)) ; + } + + void Init(const char *alphabetList, FixedSizeElemArray &compactSeq, size_t reserveLength) + { + int alphabetCodeLen = _alphabets.InitFromList(alphabetList, strlen(alphabetList)) ; + compactSeq.Malloc(alphabetCodeLen, reserveLength) ; + compactSeq.SetSize(0) ; + } + + void 
SetCapitalize(bool c) + { + _capitalize = c ; + } + + void SetMissingReplace(ALPHABET c) + { + _missingReplace = c ; + } + + // @return: number of chars added to seq + size_t Compact(const char *rawseq, FixedSizeElemArray &seq) + { + size_t i ; + size_t origLen = seq.GetSize() ; + for (i = 0 ; rawseq[i] ; ++i) + { + char c = rawseq[i] ; + if (_capitalize) + { + if (c >= 'a' && c <= 'z') + c = c - 'a' + 'A' ; + } + + if (!_alphabets.IsIn(c)) + { + if (_missingReplace == '\0') + continue ; + else + c = _missingReplace ; + } + seq.PushBack( _alphabets.Encode(c) ) ; + } + + return seq.GetSize() - origLen ; + } +} ; +} +#endif diff --git a/compactds/Sequence_Hybrid.hpp b/compactds/Sequence_Hybrid.hpp new file mode 100644 index 0000000..5bfdbd7 --- /dev/null +++ b/compactds/Sequence_Hybrid.hpp @@ -0,0 +1,328 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE_HYBRID +#define _MOURISL_COMPACTDS_SEQUENCE_HYBRID + +#include "Sequence.hpp" +#include "Sequence_WaveletTree.hpp" +#include "Sequence_RunLength.hpp" + +namespace compactds { +class Sequence_Hybrid: public Sequence +{ +private: + size_t _b ; // block size + size_t _blockCnt ; + size_t _minAvgRunLength ; // minimum average run length in a block + Bitvector_Plain _useRunLength ; // 0-plain sequence, 1-run length sequence + //size_t **_alphabetBlockPartialSum ; + Sequence_WaveletTree _waveletSeq ; + Sequence_RunLength _runlengthSeq ; + + size_t _blockSizeInferLength ; // use this amount of numbers to infer block size + + size_t EstimateSpace(const FixedSizeElemArray &S, size_t n, size_t b, size_t minRl, int alphabetBit) + { + size_t i, j ; + size_t rlBlockCnt = 0 ; // the number of blocks for run-length representation + size_t rlBlockLen = 0 ; + size_t runCnt = 0 ; // run count in runlength-endcoed sequence. 
+ size_t lastRunChr = 0 ; + for (i = 0 ; i < n ; i += b) + { + uint64_t c = S.Read(i) ; + size_t localRunCnt = 1 ; + for (j = i + 1 ; j < i + b && j < n ; ++j) + { + if (S.Read(j) != c) + { + ++localRunCnt ; + c = S.Read(j) ; + } + } + if ((j - i) / localRunCnt >= minRl) + { + size_t reduce = 0 ; + if (S.Read(i) == lastRunChr) + reduce = 1 ; + runCnt += localRunCnt - reduce ; + rlBlockLen += (j - i) ; + lastRunChr = c ; + ++rlBlockCnt ; + } + } + size_t ret = DIV_CEIL(n, b) + alphabetBit * (n - rlBlockLen) ; + + if (runCnt > 0) + ret += runCnt * Utils::Log2Ceil(n / runCnt) + alphabetBit * runCnt + runCnt * Utils::Log2Ceil(n * 4 / runCnt) ; + + return ret ; + } + + // Use the first m characters from S to determine the best block size + // the blocksize shall minimize the block bit overhead and + // maximize the number of characters that are in the rl-block + size_t ComputeBlockSize(const FixedSizeElemArray &S, size_t n, size_t alphabetSize) + { + size_t i ; + int alphabetBit = Utils::Log2Ceil(alphabetSize) ; + + size_t bestSpace = 0 ; + size_t bestTag = 0 ; + size_t m = (n < _blockSizeInferLength ? 
n : _blockSizeInferLength) ; + for (i = 4 ; i <= m ; i *= 2) + { + size_t space = EstimateSpace(S, m, i, _minAvgRunLength, alphabetBit) ; + if (bestSpace == 0 || space < bestSpace) + { + bestSpace = space ; + bestTag = i ; + } + } + + if (bestTag <= m) + { + size_t space = EstimateSpace(S, m, bestTag / 2 * 3, _minAvgRunLength, alphabetBit) ; + if (space < bestSpace) + { + bestSpace = space ; + bestTag = bestTag / 2 * 3 ; + } + } + return bestTag ; + } + +public: + Sequence_Hybrid() + { + _b = 0 ; + _minAvgRunLength = 6 ; + _blockSizeInferLength = (1<<20) ; + } + + ~Sequence_Hybrid() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + //size_t i ; + //size_t alphabetSize = _alphabets.GetSize() ; + //for (i = 0 ; i < alphabetSize ; ++i) + // free(_alphabetBlockPartialSum[i]) ; + //free(_alphabetBlockPartialSum) ; + _n = 0 ; + } + } + + void SetBlockSize(size_t b) + { + _b = b ; + } + + void SetblockSizeInferLength(size_t l) + { + _blockSizeInferLength = l ; + } + + void SetMinAvgRunLength(size_t r) + { + _minAvgRunLength = r ; + } + + size_t GetSpace() + { + return _space + sizeof(*this) + _alphabets.GetSpace() - sizeof(_alphabets); + } + + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + size_t i, j ; + + _n = sequenceLength ; + size_t alphabetSize = _alphabets.GetSize() ; + if (alphabetSize == 0) + { + _alphabets.InitFromList(alphabetMap, strlen(alphabetMap)) ; + alphabetSize = _alphabets.GetSize() ; + } + //size_t *psums ; // use this to avoid access the rank in another type of array + //psums = (size_t *)calloc(alphabetSize, sizeof(size_t)) ; + + if (_b == 0) + _b = ComputeBlockSize(S, sequenceLength, alphabetSize) ; + + _blockCnt = DIV_CEIL(_n, _b) ; + size_t runlengthBlockCnt = 0 ; + + WORD *B = Utils::MallocByBits(_blockCnt) ; // block indicator + //_alphabetBlockPartialSum = (size_t **)malloc(sizeof(size_t *) * alphabetSize) ; + //_space += sizeof(size_t *) * alphabetSize ; + /*for (i = 0 ; i < 
alphabetSize ; ++i) + { + //_alphabetBlockPartialSum[i] = (size_t *)malloc(sizeof(size_t) * (_blockCnt + 1)) ; + //_space += sizeof(size_t) * (_blockCnt + 1) ; + }*/ + + for (i = 0 ; i < _n ; i += _b) + { + //for (j = 0 ; j < alphabetSize ; ++j) + // _alphabetBlockPartialSum[j][i / _b] = psums[j] ; + + int prevc = S.Read(i) ; + //++psums[prevc] ; + size_t rcnt = 1 ; + for (j = 1 ; j < _b && i + j < _n ; ++j) + { + int c = S.Read(i + j) ; + //++psums[c] ; + if (c != prevc) + { + ++rcnt ; + prevc = c ; + } + } + if (_b / rcnt >= _minAvgRunLength) + { + ++runlengthBlockCnt ; + Utils::BitSet(B, i / _b) ; + } + } + //for (j = 0 ; j < alphabetSize ; ++j) + // _alphabetBlockPartialSum[j][i / _b] = psums[j] ; + _useRunLength.Init(B, _blockCnt) ; + + // Split the sequence into two parts + FixedSizeElemArray tmpS ; + tmpS.Malloc(S.GetElemLength(), _n) ; + int k ; // use run length + for ( k = 0 ; k <= 1 ; ++k) + { + size_t size = 0 ; + for (i = 0 ; i < _n ; i += _b) + { + if (Utils::BitRead(B, i / _b) != k) + continue ; + for (j = 0 ; j < _b && i + j < _n ; ++j) + { + tmpS.Write(size, S.Read(i + j)) ; + ++size ; + } + } + + tmpS.SetSize(size) ; + //printf("%d %d\n", _b, size) ; + if (k == 0) + { + if (size > 0) + { + _waveletSeq.SetSelectSpeed( DS_SELECT_SPEED_NO ) ; + _waveletSeq.Init(tmpS, size, alphabetMap) ; + } + } + else + { + if (size > 0) + _runlengthSeq.Init(tmpS, size, alphabetMap) ; + } + } + _space += _useRunLength.GetSpace() - sizeof(_useRunLength) ; + _space += _waveletSeq.GetSpace() - sizeof(_waveletSeq) ; + _space += _runlengthSeq.GetSpace() - sizeof(_runlengthSeq) ; + //printf("%d %d %d\n", sizeof(*this), sizeof(_waveletSeq), sizeof(_runlengthSeq)) ; + + //free(psums) ; + free(B) ; + } + + ALPHABET Access(size_t i) const + { + size_t bi = i / _b ; + int type = _useRunLength.Access(bi) ; + if (type == 0) + { + size_t r = _useRunLength.Rank(1, bi) ; + i -= _b * r ; + return _waveletSeq.Access(i) ; + } + else + { + size_t r = _useRunLength.Rank(0, bi) ; + i -= 
_b * r ; + return _runlengthSeq.Access(i) ; + } + } + + size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const + { + if (!inclusive) + { + if (i == 0) + return 0 ; + --i ; + } + + size_t bi = i / _b ; + int type = _useRunLength.Access(bi) ; + size_t ranki = _useRunLength.Rank(type, bi) ; + size_t otherRanki = (bi + 1) - ranki ; + + size_t ret = 0 ; + size_t typei = (ranki - 1) * _b + i % _b ; // ranki>=1 because bi is of type. + if (type == 0) + ret = _waveletSeq.Rank(c, typei) ; + else + ret = _runlengthSeq.Rank(c, typei) ; + if (otherRanki == 0) + return ret ; + + size_t otheri = otherRanki * _b - 1 ; + if (type == 0) + ret += _runlengthSeq.Rank(c, otheri) ; + else + ret += _waveletSeq.Rank(c, otheri) ; + + return ret ; + } + + size_t Select(ALPHABET c, size_t i) const + { + return 0 ; + } + + void Save(FILE *fp) + { + Sequence::Save(fp) ; + SAVE_VAR(fp, _b) ; + SAVE_VAR(fp, _blockCnt) ; + SAVE_VAR(fp, _minAvgRunLength) ; + _useRunLength.Save(fp) ; + _waveletSeq.Save(fp) ; + _runlengthSeq.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + Sequence::Load(fp) ; + LOAD_VAR(fp, _b) ; + LOAD_VAR(fp, _blockCnt) ; + LOAD_VAR(fp, _minAvgRunLength) ; + _useRunLength.Load(fp) ; + _waveletSeq.Load(fp) ; + _runlengthSeq.Load(fp) ; + } + + void PrintStats() + { + Utils::PrintLog("Sequence_Hybrid: total_length: %lu block_size: %lu min_avg_runlength: %lu runlength_block: %lu", + _n, _b, _minAvgRunLength, _useRunLength.Rank(1, _blockCnt - 1)) ; + _runlengthSeq.PrintStats() ; + _waveletSeq.PrintStats() ; + } +} ; +} + +#endif diff --git a/compactds/Sequence_Permutation.hpp b/compactds/Sequence_Permutation.hpp new file mode 100644 index 0000000..03137be --- /dev/null +++ b/compactds/Sequence_Permutation.hpp @@ -0,0 +1,70 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE_PERMUTATION +#define _MOURISL_COMPACTDS_SEQUENCE_PERMUTATION + +#include "Utils.hpp" +#include "Alphabet.hpp" +#include "FixedSizeElemArray.hpp" +#include "Sequence.hpp" + +namespace compactds { +class 
Sequence_Permutation: public Sequence +{ +private: +public: + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + } + + void Free() + { + } + + size_t GetSpace() + { + } + + ALPHABET Access(size_t i) const + { + return AccessLong(i) ; + } + + size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const + { + return RankLong(c, i, inclusive) ; + } + + size_t Select(ALPHABET c, size_t i) const + { + return SelectLong(c, i) ; + } + + size_t AccessLong(size_t i) const + { + } + + size_t RankLong(size_t c, size_t i, int inclusive = 1) const + { + } + + size_t SelectLong(size_t c, size_t i) const + { + } + + + void Save(FILE *fp) + { + Sequence::Save(fp) ; + } + + void Load(FILE *fp) + { + Sequence::Load(fp) ; + } + + void PrintStats() + { + } +} ; +} + +#endif diff --git a/compactds/Sequence_Plain.hpp b/compactds/Sequence_Plain.hpp new file mode 100644 index 0000000..a93e568 --- /dev/null +++ b/compactds/Sequence_Plain.hpp @@ -0,0 +1,101 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE_PLAIN +#define _MOURISL_COMPACTDS_SEQUENCE_PLAIN + +#include "Utils.hpp" +#include "Alphabet.hpp" +#include "Sequence.hpp" + +#include "Bitvector_Plain.hpp" +#include "Bitvector_RunLength.hpp" + +// The sequence representation where each alphabet is a bitvector +namespace compactds { +template +class Sequence_Plain: public Sequence +{ +private: + BvClass *_Bvs ; // bitvectors + int _selectSpeed ; +public: + Sequence_Plain() + { + _selectSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + _space = 0; + } + + ~Sequence_Plain() + { + Free() ; + } + + void Free() + { + delete[] _Bvs ; + } + + size_t GetSpace() + { + return _space + _alphabets.GetSpace() - sizeof(_alphabets) + sizeof(*this) ; + } + + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + size_t i, j, k ; + _space = 0 ; + + if (_alphabets.GetSize() == 0) + _alphabets.InitFromList(alphabetMap, strlen(alphabetMap)) ; + + this->_n = sequenceLength ; + size_t 
alphabetSize = _alphabets.GetSize() ; + + _Bvs = new BvClass[alphabetSize] ; + WORD *B = Utils::MallocByBits(_n) ; + for (i = 0 ; i < alphabetSize ; ++i) + { + for (j = 0 ; j < _n ; j += WORDBITS) + { + WORD w = 0 ; + for (k = 0 ; k < WORDBITS && j + k < _n ; ++k) + { + if (S.Read(j + k) == (int)i) + w |= (1ull< + +#include "Sequence.hpp" +#include "Sequence_WaveletTree.hpp" + +// Split the original sequence into fixed-length blocks, +// compress the single-run block by reducing it to one character +namespace compactds { +class Sequence_RunBlock: public Sequence +{ +private: + size_t _b ; // block size + size_t _blockCnt ; + Bitvector_Plain _useRunBlock ; // 0-plain sequence, 1-homo polymer sequence + //size_t **_alphabetBlockPartialSum ; + Sequence_WaveletTree _waveletSeq ; + Sequence_WaveletTree _runBlockSeq ; + + // Variables and functions related to automatic block size estimation + size_t _blockSizeInferLength ; // use this amount of numbers to infer block size + size_t EstimateSpace(const FixedSizeElemArray &S, size_t n, size_t b, int alphabetBit) + { + size_t i, j ; + size_t runBlockCnt = 0 ; + size_t runBlockLen = 0 ; + for (i = 0 ; i < n ; i += b) + { + uint64_t c = S.Read(i) ; + bool runBlockFlag = true ; + for (j = i + 1 ; j < i + b && j < n ; ++j) + { + if (S.Read(j) != c) + { + runBlockFlag = false ; + break ; + } + } + if (runBlockFlag) + { + ++runBlockCnt ; + runBlockLen += (j - i) ; + } + } + return DIV_CEIL(n, b) + alphabetBit * (runBlockCnt + n - runBlockLen) ; + } + + // Use the first m characters from S to determine block size + size_t ComputeBlockSize(const FixedSizeElemArray &S, size_t n, size_t alphabetSize) + { + size_t i ; + int alphabetBit = Utils::Log2Ceil(alphabetSize) ; + + size_t bestSpace = 0 ; + size_t bestTag = 0 ; + size_t m = (n < _blockSizeInferLength ? 
n : _blockSizeInferLength) ; + for (i = 4 ; i <= m ; i *= 2) + { + size_t space = EstimateSpace(S, m, i, alphabetBit) ; + if (bestSpace == 0 || space < bestSpace) + { + bestSpace = space ; + bestTag = i ; + } + } + + if (bestTag <= m) + { + size_t space = EstimateSpace(S, m, bestTag / 2 * 3, alphabetBit) ; + if (space < bestSpace) + { + bestSpace = space ; + bestTag = bestTag / 2 * 3 ; + } + + size_t r = 0 ; + size_t c = S.Read(0) ; + for (i = 1 ; i < m ; ++i) + { + size_t tmp = S.Read(i) ; + if (tmp != c) + { + ++r ; + c = tmp ; + } + } + size_t testSize = CEIL(sqrt((double)m/(double)r)) ; + if (testSize > 2) + { + space = EstimateSpace(S, m, testSize, alphabetBit) ; + if (space < bestSpace) + { + bestSpace = space ; + bestTag = testSize ; + } + } + } + return bestTag ; + } + +public: + Sequence_RunBlock() + { + _b = 0 ; + _blockSizeInferLength = (1<<20) ; + } + + ~Sequence_RunBlock() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + //size_t i ; + //size_t alphabetSize = _alphabets.GetSize() ; + //for (i = 0 ; i < alphabetSize ; ++i) + // free(_alphabetBlockPartialSum[i]) ; + //free(_alphabetBlockPartialSum) ; + _n = 0 ; + } + } + + void SetBlockSize(size_t b) + { + _b = b ; + } + + void SetBlockSizeInferLength(size_t l) + { + _blockSizeInferLength = l ; + } + + size_t GetSpace() + { + bool inclusive = true ; + return _space + _alphabets.GetSpace() - sizeof(_alphabets) + + (inclusive ? 
sizeof(*this) : 0) ; + } + + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + size_t i, j ; + + _n = sequenceLength ; + size_t alphabetSize = _alphabets.GetSize() ; + if (alphabetSize == 0) + { + _alphabets.InitFromList(alphabetMap, strlen(alphabetMap)) ; + alphabetSize = _alphabets.GetSize() ; + } + + if (_b == 0) + _b = ComputeBlockSize(S, sequenceLength, alphabetSize) ; + + _blockCnt = DIV_CEIL(_n, _b) ; + + WORD *B = Utils::MallocByBits(_blockCnt) ; // block indicator + + for (i = 0 ; i < _n ; i += _b) + { + int prevc = S.Read(i) ; + size_t rcnt = 1 ; + for (j = 1 ; j < _b && i + j < _n ; ++j) + { + int c = S.Read(i + j) ; + if (c != prevc) + { + ++rcnt ; + prevc = c ; + break ; + } + } + if (rcnt == 1) + { + Utils::BitSet(B, i / _b) ; + } + } + _useRunBlock.SetSelectSpeed(DS_SELECT_SPEED_NO) ; + _useRunBlock.Init(B, _blockCnt) ; + + // Split the sequence into two parts + FixedSizeElemArray tmpS ; + tmpS.Malloc(S.GetElemLength(), _n) ; + int k ; // use run lbock + for ( k = 0 ; k <= 1 ; ++k) + { + size_t size = 0 ; + for (i = 0 ; i < _n ; i += _b) + { + if (Utils::BitRead(B, i / _b) != k) + continue ; + if (k == 0) + { + for (j = 0 ; j < _b && i + j < _n ; ++j) + { + tmpS.Write(size, S.Read(i + j)) ; + ++size ; + } + } + else + { + tmpS.Write(size, S.Read(i)) ; + ++size ; + } + } + + tmpS.SetSize(size) ; + //printf("%d %d\n", _b, size) ; + if (k == 0) + { + if (size > 0) + { + _waveletSeq.SetSelectSpeed( DS_SELECT_SPEED_NO ) ; + _waveletSeq.Init(tmpS, size, alphabetMap) ; + } + } + else + { + if (size > 0) + { + _runBlockSeq.SetSelectSpeed( DS_SELECT_SPEED_NO ) ; + _runBlockSeq.Init(tmpS, size, alphabetMap) ; + } + } + } + _space += _useRunBlock.GetSpace() - sizeof(_useRunBlock) ; + _space += _waveletSeq.GetSpace() - sizeof(_waveletSeq) ; + _space += _runBlockSeq.GetSpace() - sizeof(_runBlockSeq) ; + //printf("%d %d %d\n", sizeof(*this), sizeof(_waveletSeq), sizeof(_runBlockSeq)) ; + + free(B) ; + } + + ALPHABET 
Access(size_t i) const + { + size_t bi = i / _b ; + int type = _useRunBlock.Access(bi) ; + if (type == 0) + { + size_t r = _useRunBlock.Rank(1, bi) ; + i -= _b * r ; + return _waveletSeq.Access(i) ; + } + else + { + size_t r = _useRunBlock.Rank(0, bi) ; + i -= _b * r ; + return _runBlockSeq.Access(i/_b) ; + } + } + + size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const + { + if (!inclusive) + { + if (i == 0) + return 0 ; + --i ; + } + + size_t bi = i / _b ; + int type = _useRunBlock.Access(bi) ; + size_t ranki = _useRunBlock.Rank(type, bi) ; + size_t otherRanki = (bi + 1) - ranki ; + + size_t ret = 0 ; + if (type == 0) + ret = _waveletSeq.Rank(c, (ranki - 1) * _b + i % _b) ; // ranki>=1 because bi is of type. + else + { + bool inRun = true ; + size_t rbRank = _runBlockSeq.RankAndTest(c, ranki - 1, inRun) ; // type==1 makes sure ranki >= 1 + if (inRun) // This makes sure rbRank>=1 at (ranki-1) + ret = (rbRank - 1) * _b + i % _b + 1; + else + ret = rbRank * _b ; + } + + if (otherRanki == 0) + { + return ret ; + } + if (type == 0) + ret += _runBlockSeq.Rank(c, otherRanki - 1) * _b ; + else + ret += _waveletSeq.Rank(c, otherRanki * _b - 1) ; + + return ret ; + } + + size_t Select(ALPHABET c, size_t i) const + { + return 0 ; + } + + void Decompress(FixedSizeElemArray &S) + { + S.Free() ; + + size_t i, j ; + size_t rbIdx = 0 ; + size_t alphabetBit = Utils::Log2Ceil(_alphabets.GetSize()) ; + S.Malloc(alphabetBit, _n) ; + for (i = 0 ; i < _n ; i += _b) + { + size_t k = i / _b ; + if (_useRunBlock.Access(k) == 1) + { + size_t c = _runBlockSeq.Access(rbIdx) ; + for (j = i ; j < i + _b && j < _n ; ++j) + S.Write(j, c) ; + ++rbIdx ; + } + else + { + size_t l = i - _b * rbIdx ; + for (j = i ; j < i + _b && j < _n ; ++j, ++l) + { + size_t c = _waveletSeq.Access(l) ; + S.Write(j, c) ; + } + } + } + } + + void Save(FILE *fp) + { + Sequence::Save(fp) ; + SAVE_VAR(fp, _b) ; + SAVE_VAR(fp, _blockCnt) ; + _useRunBlock.Save(fp) ; + _waveletSeq.Save(fp) ; + _runBlockSeq.Save(fp) ; 
+ } + + void Load(FILE *fp) + { + Free() ; + + Sequence::Load(fp) ; + LOAD_VAR(fp, _b) ; + LOAD_VAR(fp, _blockCnt) ; + _useRunBlock.Load(fp) ; + _waveletSeq.Load(fp) ; + _runBlockSeq.Load(fp) ; + } + + void PrintStats() + { + Utils::PrintLog("Sequence_RunBlock: total_length: %lu block_size: %lu runBlock_block: %lu", + _n, _b, _useRunBlock.Rank(1, _blockCnt - 1)) ; + _runBlockSeq.PrintStats() ; + _waveletSeq.PrintStats() ; + } +} ; +} + +#endif diff --git a/compactds/Sequence_RunLength.hpp b/compactds/Sequence_RunLength.hpp new file mode 100644 index 0000000..028ac80 --- /dev/null +++ b/compactds/Sequence_RunLength.hpp @@ -0,0 +1,191 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE_RUNLENGTH +#define _MOURISL_COMPACTDS_SEQUENCE_RUNLENGTH + +#include "Sequence.hpp" +#include "Sequence_WaveletTree.hpp" +#include "PartialSum.hpp" +#include "SimpleVector.hpp" + +// This sequence type assumes the alphabet coder is plain. +namespace compactds { +class Sequence_RunLength : public Sequence +{ +private: + Bitvector_Sparse _runs ; // mark the beginning of each runs, E in the manuscript + Sequence_WaveletTree _runChars ; // the character for each run, supporting ranking, L' in the manuscript + PartialSum *_alphabetPartialSum ; // the partial length with respect to each alphabet, D in the manuscript + size_t _rcnt ; +public: + Sequence_RunLength() + { + _n = _rcnt = 0 ; + } + + ~Sequence_RunLength() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + _n = _rcnt = 0 ; + delete[] _alphabetPartialSum ; + } + } + + size_t GetSpace() + { + return _space + _alphabets.GetSize() - sizeof(_alphabets) + sizeof(*this) ; + } + + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + size_t i ; + uint8_t c ; // character + + if (_alphabets.GetSize() == 0) + _alphabets.InitFromList(alphabetMap, strlen(alphabetMap)) ; + + // Get the runs + _n = sequenceLength ; + _rcnt = 1 ; + c = S.Read(0) ; + for (i = 1 ; i < sequenceLength ; ++i) + { + if ( S.Read(i) 
!= c) + { + ++_rcnt ; + c = S.Read(i) ; + } + } + + FixedSizeElemArray chars ; + WORD *W = Utils::MallocByBits(sequenceLength + 2) ; + chars.Malloc(S.GetElemLength(), _rcnt) ; + + c = S.Read(0) ; + chars.Write(0, c) ; + Utils::BitSet(W, 0) ; + _rcnt = 1 ; + for (i = 1 ; i < sequenceLength ; ++i) + { + if (S.Read(i) != c) + { + c = S.Read(i) ; + + Utils::BitSet(W, i) ; + chars.Write(_rcnt, c) ; + ++_rcnt ; + } + } + //Utils::BitSet(W, sequenceLength) ; + _runs.Init(W, sequenceLength + 2) ; + _runChars.SetSelectSpeed( DS_SELECT_SPEED_NO ) ; + _runChars.Init(chars, _rcnt, alphabetMap) ; + _space = _runs.GetSpace() - sizeof(_runs) + _runChars.GetSpace() - sizeof(_runChars) ; + + + // Process the runs/partial sums for each alphabet + int alphabetSize = _alphabets.GetSize() ; + _alphabetPartialSum = new PartialSum[alphabetSize] ; + for (c = 0 ; c < alphabetSize ; ++c) + { + memset(W, 0, Utils::BitsToWords(sequenceLength) * sizeof(WORD)) ; + size_t psum = 0 ; + Utils::BitSet(W, 0) ; + for (i = 0 ; i < sequenceLength ; ) + { + if (S.Read(i) != c) + { + ++i ; + continue ; + } + size_t j ; + for (j = i ; j < sequenceLength ; ++j) + if (S.Read(j) != c) + break ; + psum += j - i ; + Utils::BitSet(W, psum) ; + i = j ; + } + _alphabetPartialSum[c].InitFromBitvector(W, psum + 1) ; + _space += _alphabetPartialSum[c].GetSpace() - sizeof(_alphabetPartialSum[c]) ; + } + free(W) ; + } + + ALPHABET Access(size_t i) const + { + return _runChars.Access(_runs.Rank(1, i) - 1) ; + } + + size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const + { + if (!inclusive) + { + if (i == 0) + return 0 ; + else + --i ; + } + + size_t cid = _alphabets.Encode(c) ; + size_t rrank = _runs.Rank(1, i) ; // rank in runs + + bool inRun = true ; + size_t crank = _runChars.RankAndTest(c, rrank - 1, inRun) ; // rank for this character + //printf("%c %d: rrank=%d crank=%d\n", c, i, rrank, crank) ; + if (inRun) + { + size_t psum = _alphabetPartialSum[cid].Sum(crank - 1) ; + //printf("%d %d. 
ret=%d\n", psum, _runs.Select(1, rrank), + // psum + i - _runs.Select(1, rrank) + 1) ; + return psum + i - _runs.Select(1, rrank) + 1 ; + } + else + { + //printf("other %d\n", _alphabetPartialSum[cid].Sum(crank)) ; + return _alphabetPartialSum[cid].Sum(crank) ; + } + } + + // Not supported + size_t Select(ALPHABET c, size_t i) const + { + return 0 ; + } + + void Save(FILE *fp) + { + Sequence::Save(fp) ; + SAVE_VAR(fp, _rcnt) ; + _runs.Save(fp) ; + _runChars.Save(fp) ; + int alphabetSize = _alphabets.GetSize() ; + for (int i = 0 ; i < alphabetSize ; ++i) + _alphabetPartialSum[i].Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + Sequence::Load(fp) ; + LOAD_VAR(fp, _rcnt) ; + _runs.Load(fp) ; + _runChars.Load(fp) ; + int alphabetSize = _alphabets.GetSize() ; + _alphabetPartialSum = new PartialSum[alphabetSize] ; + for (int i = 0 ; i < alphabetSize ; ++i) + _alphabetPartialSum[i].Load(fp) ; + } + + void PrintStats() + { + Utils::PrintLog("Sequence_RunLength: total_length: %lu run count: %lu", _n, _rcnt) ; + } +} ; +} + +#endif diff --git a/compactds/Sequence_WaveletTree.hpp b/compactds/Sequence_WaveletTree.hpp new file mode 100644 index 0000000..440de14 --- /dev/null +++ b/compactds/Sequence_WaveletTree.hpp @@ -0,0 +1,338 @@ +#ifndef _MOURISL_COMPACTDS_SEQUENCE_WAVELETTREE +#define _MOURISL_COMPACTDS_SEQUENCE_WAVELETTREE + +#include "Utils.hpp" +#include "Sequence.hpp" + +#include + +#include "Bitvector_Plain.hpp" +#include "Bitvector_RunLength.hpp" + +namespace compactds { +template +struct _sequence_wavelettree_node +{ + BvClass v ; + WORD prefix ; // the + int prefixLen ; // bits in prefix + int children[2] ; + + void Save(FILE *fp) + { + SAVE_VAR(fp, prefix) ; + SAVE_VAR(fp, prefixLen) ; + SAVE_ARR(fp, children, 2) ; + v.Save(fp) ; + } + + void Load(FILE *fp) + { + LOAD_VAR(fp, prefix) ; + LOAD_VAR(fp, prefixLen) ; + LOAD_ARR(fp, children, 2) ; + v.Load(fp) ; + } +} ; + +// The implementation of wavelet tree in either +// perfect balanced or huffman shape, 
+// depending on the choice of alphabet. +template +class Sequence_WaveletTree: public Sequence +{ +private: + struct _sequence_wavelettree_node *_T ; + int _tNodeCnt ; + int _selectSpeed ; + + // Based on the pos-th bits (0-index, count from leftside) + // maxPosToRight: record the maximum distance from pos to right side. + // return: the number of 1s + uint64_t ConvertSequenceToBits(const FixedSizeElemArray &S, const ALPHABET *alphabetMap, int pos, WORD *v, int &maxPosToRight) + { + size_t i ; + size_t n = S.GetSize() ; + uint64_t ret = 0; + maxPosToRight = 0 ; + + for (i = 0 ; i < n ; ++i) + { + int codeLen = 0 ; + int b = _alphabets.Encode( alphabetMap[S.Read(i)], codeLen ) ; + if (codeLen - pos > maxPosToRight) + maxPosToRight = codeLen - pos ; + if (b & (1<<(codeLen - pos - 1))) + { + Utils::BitSet(v, i) ; // Assume the array is already initated to be all 0 + ++ret ; + } + } + return ret ; + } + + // Assume left and right's memory has been allocated. + void SplitSequence(const FixedSizeElemArray &orig, WORD *v, + FixedSizeElemArray &left, FixedSizeElemArray &right) + { + size_t i ; + size_t len = orig.GetSize() ; + + size_t leftLen = 0 ; + size_t rightLen = 0 ; + for (i = 0 ; i < len ; ++i) + { + if (!Utils::BitRead(v, i)) + { + left.Write(leftLen, orig.Read(i)) ; + ++leftLen ; + } + else + { + right.Write(rightLen, orig.Read(i)) ; + ++rightLen ; + } + } + } + + // The recursive function that construct the tree. + // Assumes that the leaf node always has the brother. 
+ // depth: how many bits has been processed so far + // tused: the number of wavelet tree node used so far + // bufferv: preallcoated memory to holding temporary bit array + // return: node id (index in T) + int BuildTree(const FixedSizeElemArray &S, const ALPHABET *alphabetMap, int depth, WORD prefix, WORD *bufferv) + { + size_t len = S.GetSize() ; + int ti = _tNodeCnt ; + ++_tNodeCnt ; + int remainingBits ; + + memset(bufferv, 0, Utils::BitsToWordBytes(len)) ; + uint64_t onecnt = ConvertSequenceToBits(S, alphabetMap, depth, bufferv, remainingBits) ; + _T[ti].v.SetSelectSpeed(_selectSpeed) ; + _T[ti].v.Init(bufferv, len) ; + _space += _T[ti].v.GetSpace() - sizeof(_T[ti].v) ; + _T[ti].prefix = prefix ; + _T[ti].prefixLen = depth ; + if (remainingBits == 1 || S.GetSize() == 0) + { + // Reach leaf. + _T[ti].children[0] = _T[ti].children[1] = -1 ; + return ti ; + } + FixedSizeElemArray leftS, rightS ; // the memory should be automatically released + leftS.Malloc(S.GetElemLength(), len - onecnt) ; + rightS.Malloc(S.GetElemLength(), onecnt) ; + SplitSequence(S, bufferv, leftS, rightS) ; + + _T[ti].children[0] = BuildTree(leftS, alphabetMap, depth + 1, prefix << 1, bufferv) ; + _T[ti].children[1] = BuildTree(rightS, alphabetMap, depth + 1, (prefix << 1) | 1ull, bufferv) ; + + return ti ; + } + + int AccessInNode(int ti, size_t i) const + { + return _T[ti].v.Access(i) ; + } + + // Calculate the rank(type, i) in T[ti] + size_t RankInNode(int ti, int type, size_t i, int inclusive = 1) const + { + return _T[ti].v.Rank(type, i, inclusive) ; + } + + size_t SelectInNode(int ti, int type, size_t i) const + { + return _T[ti].v.Select(type, i); + } + + // Recursive function for Select + // c: the code for the alphabet. + // l: the length of the code + // i: the select we want to query. The ith chracter c. 
+ // ti: tree node idx + // depth: the recursive depth + size_t RecursiveSelect(WORD c, int l, size_t i, int ti, int depth ) const + { + int b = (c >> (l-depth-1)) & 1 ; + if (depth >= l - 1) + { + return SelectInNode(ti, b, i) ; + } + + // Need the +1 to convert the index from Select to the rank as the input to Select + return SelectInNode(ti, b, RecursiveSelect(c, l, i, _T[ti].children[b], depth + 1) + 1 ) ; + } +public: + Sequence_WaveletTree() + { + _tNodeCnt = 0 ; + _selectSpeed = BITVECTOR_DEFAULT_SELECT_SPEED ; + } + + ~Sequence_WaveletTree() {Free() ;} + + void Free() + { + if (_tNodeCnt) + { + delete[] _T ; + _T = NULL ; + _tNodeCnt = 0 ; + } + } + + void SetSelectSpeed(int speed) + { + _selectSpeed = speed ; + } + + size_t GetSpace() {return _space + _alphabets.GetSpace() - sizeof(_alphabets) + sizeof(this) ;} + + // We compactly represent the input sequence as fixed-size element array in a plain fashion + // just to save some memory when construct the tree. + void Init(const FixedSizeElemArray &S, size_t sequenceLength, const ALPHABET *alphabetMap) + { + _space = 0 ; + this->_n = sequenceLength ; + + if (_alphabets.GetSize() == 0) + _alphabets.InitFromList(alphabetMap, strlen(alphabetMap)) ; + + _T = new struct _sequence_wavelettree_node[_alphabets.GetAlphabetCapacity() - 1] ; + _tNodeCnt = 0 ; + _space += sizeof(*_T) * (_alphabets.GetAlphabetCapacity() - 1) ; + + WORD *bufferv = Utils::MallocByBits(sequenceLength) ; + BuildTree(S, alphabetMap, 0, 0, bufferv) ; + free(bufferv) ; + } + + // Return: the alphabet at position i. + ALPHABET Access(size_t i) const + { + int l = 0 ; + WORD code = 0 ; + int ti = 0 ; + for (l = 0 ; ti != -1 ; ++l) + { + int b = AccessInNode(ti, i) ; + code = (code << 1) | b ; + // Need -1 to convert the rank number to array index. + // There is no need to check the negativity from -1, + // because we know the current bit is 0, so rank>=1. 
+ //i = _T[ti].v.Rank(b, i) - 1 ; + i = RankInNode(ti, b, i) - 1 ; + ti = _T[ti].children[b] ; + } + return _alphabets.Decode(code, l) ; + } + + // Return: the number of alphabet c's in [0..i] + size_t Rank(ALPHABET c, size_t i, int inclusive = 1) const + { + int l = 0 ; // l: the length of the code + WORD code = _alphabets.Encode(c, l) ; + int depth = 0 ; + int ti = 0 ; + if (!inclusive) // Since in the wavelet tree, the non-inclusive operation should only + // happen in the leaf node, so we directly modify i here for simplicity. + { + if (i == 0) + return 0 ; + else + --i ; + } + for (depth = 0 ; depth < l ; ++depth) + { + int b = (code >> (l - depth - 1)) & 1 ; + + //i = _T[ti].v.Rank(b, i) ; + i = RankInNode(ti, b, i) ; + + if (i == 0 || depth == l - 1) + break ; + // R count the number of 1's (or 0's), so we need to -1 to change it to 0-based index + // as in the bitvector. + --i ; + ti = _T[ti].children[b] ; + } + return i ; + } + + // Return: rank of c in [0..i] (inclusive), + // also test whether T[i]==c, return through isC + size_t RankAndTest(ALPHABET c, size_t i, bool &isC) const + { + int l = 0 ; // l: the length of the code + WORD code = _alphabets.Encode(c, l) ; + int depth = 0 ; + int ti = 0 ; + + isC = true ; + for (depth = 0 ; depth < l ; ++depth) + { + int b = (code >> (l - depth - 1)) & 1 ; + if (isC && b != AccessInNode(ti, i)) + isC = false ; + + //i = _T[ti].v.Rank(b, i) ; + i = RankInNode(ti, b, i) ; + + if (i == 0 || depth == l - 1) + break ; + // R count the number of 1's (or 0's), so we need to -1 to change it to 0-based index + // as in the bitvector. 
+ --i ; + ti = _T[ti].children[b] ; + } + return i ; + } + + + + // return: the index of the ith (1-based) c + size_t Select(ALPHABET c, size_t i) const + { + int l = 0 ; + WORD code = _alphabets.Encode(c, l) ; + return RecursiveSelect(code, l, i, 0, 0) ; + } + + void Save(FILE *fp) + { + Sequence::Save(fp) ; + SAVE_VAR(fp, _tNodeCnt) ; + SAVE_VAR(fp, _selectSpeed) ; + int i ; + for (i = 0 ; i < _tNodeCnt ; ++i) + _T[i].Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + Sequence::Load(fp) ; + LOAD_VAR(fp, _tNodeCnt) ; + LOAD_VAR(fp, _selectSpeed) ; + + if (_alphabets.GetSize() == 0) //empty tree + return ; + + _T = new struct _sequence_wavelettree_node[_tNodeCnt] ; + int i ; + for (i = 0 ; i < _tNodeCnt ; ++i) + _T[i].Load(fp) ; + } + + void PrintStats() + { + Utils::PrintLog("Sequence_WaveletTree: total_length: %lu node_cnt: %lu", _n, _tNodeCnt) ; + } + +} ; +} + +#endif diff --git a/compactds/SimpleVector.hpp b/compactds/SimpleVector.hpp new file mode 100644 index 0000000..8f51070 --- /dev/null +++ b/compactds/SimpleVector.hpp @@ -0,0 +1,388 @@ +#ifndef _LSONG_SIMPLE_VECTOR_HEADER +#define _LSONG_SIMPLE_VECTOR_HEADER + +// A light version of vector, which increase the size of the array by +// a value no more than specified if it got overflow. +// And the type of elements is basic. + +#include +#include +#include + +//const int maxInc = -1 ; + +template +class SimpleVector +{ +private: + int size ; + int capacity ; + int maxInc ; // The maximal value we can use to increase the capacity. + int inc ; + T *s ; +public: + SimpleVector() : maxInc( -1 ) + { + s = NULL ; + size = capacity = 0 ; + inc = 1 ; + } + + SimpleVector( int mi ): maxInc( mi ) + { + s = NULL ; + size = capacity = 0 ; + inc = 1 ; + } + + SimpleVector( const SimpleVector &in ) + { + size = in.size ; + capacity = in.capacity ; + if ( capacity > 0 ) + { + //s = in.s ; + //if ( in.s == NULL ) + // printf( "null s. 
%d %d\n", in.size, in.capacity ) ; + s = (T *)malloc( sizeof( T ) * capacity ) ; + memcpy( s, in.s, sizeof( T ) * capacity ) ; + } + else + s = NULL ; + inc = in.inc ; + maxInc = in.maxInc ; + } + + SimpleVector& operator=( const SimpleVector &in ) + { + if ( this != &in ) + { + if ( s != NULL ) + free( s ) ; + size = in.size ; + capacity = in.capacity ; + + if ( capacity > 0 ) + { + //s = in.s ; + s = (T *)malloc( sizeof( T ) * capacity ) ; + memcpy( s, in.s, sizeof( T ) * capacity ) ; + } + else + s = NULL ; + + inc = in.inc ; + maxInc = in.maxInc ; + } + return *this ; + } + + ~SimpleVector() + { + if ( s != NULL ) + free( s ) ; + capacity = 0 ; + size = 0 ; + } + + void Release() + { + if ( s != NULL ) + free( s ) ; + s = NULL ; + size = capacity = 0 ; + } + + void Reserve( int sz ) + { + if ( s != NULL ) + free( s ) ; + s = (T *)malloc( sizeof( T ) * sz ) ; + size = 0 ; + capacity = sz ; + inc = sz ; + + if ( maxInc > 0 && inc > maxInc ) + inc = maxInc ; + } + + + int PushBack( const T &in ) + { + if ( size == capacity ) + { + //int tmp = capacity ; + capacity += inc ; + inc *= 2 ; + if ( maxInc > 0 && inc > maxInc ) + inc = maxInc ; + if ( size == 0 ) + s = (T *)malloc( sizeof( T ) * capacity ) ; + else + s = (T *)realloc( s, sizeof( T ) * capacity ) ; + if ( s == NULL ) + { + fprintf( stderr, "%s: Failed to allocate memory.\n", __func__ ) ; + exit( 1 ) ; + } + } + s[ size ] = in ; + ++size ; + return size ; + } + + int PushBack( const SimpleVector &in ) + { + int newsize = size + in.size ; + if ( newsize > capacity ) + { + //int tmp = capacity ; + capacity = newsize + inc ; + inc *= 2 ; + if ( maxInc > 0 && inc > maxInc ) + inc = maxInc ; + if ( size == 0 ) + s = (T *)malloc( sizeof( T ) * capacity ) ; + else + s = (T *)realloc( s, sizeof( T ) * capacity ) ; + if ( s == NULL ) + { + fprintf( stderr, "%s: Failed to allocate memory.\n", __func__ ) ; + exit( 1 ) ; + } + } + memcpy( s + size, in.s, sizeof( T ) * in.size ) ; + size = newsize ; + return size ; + } 
+ + T PopBack() + { + if ( size == 0 ) + { + fprintf( stderr, "%s: empty array.\n", __func__ ) ; + exit( 1 ) ; + } + --size ; + return s[size] ; + } + + int GetInc() + { + return inc ; + } + + void SetInc( int in ) + { + inc = in ; + } + + void SetMaxInc( int in ) + { + maxInc = in ; + } + int GetMaxInc() + { + return maxInc ; + } + int Size() const + { + return size ; + } + + int Resize( int s ) + { + size = s ; + return size ; + } + + int Capacity() + { + return capacity ; + } + + T &Get( int i ) + { + if ( i >= size ) + { + fprintf( stderr, "%s: Access out of the vector.\n", __func__ ) ; + exit( 1 ) ; + } + return s[i] ; + } + + T &operator[]( int i ) const + { + /*if ( i >= size ) + { + printf( "ERROR\n" ) ; + }*/ + //assert( i < size ) ; + /*if ( i >= size ) + { + fprintf( stderr, "%s: Access out of the vector.\n", __func__ ) ; + exit( 1 ) ; + }*/ + return s[i] ; + } + + // Return how many element left. + int Remove( int ind ) + { + int i ; + if ( ind >= size ) + { + fprintf( stderr, "%s: Access out of the vector.\n", __func__ ) ; + exit( 1 ) ; + } + + //if ( size == 1 ) + // return 0 ; + for ( i = ind ; i < size - 1 ; ++i ) + s[i] = s[i + 1] ; + --size ; + return size ; + } + + // Allocate less memory. 
+ int Shrink() + { + if ( size < capacity / 4 ) + { + capacity /= 2 ; + inc = capacity ; + if ( inc > maxInc ) + inc = maxInc ; + s = (T *)realloc( s, sizeof( T ) * capacity ) ; + } + return capacity ; + } + + void Clear() + { + size = 0 ; + } + + void QSort( int (*compare)(const void*,const void*) ) + { + qsort( s, size, sizeof( T ), compare ) ; + } + + int BinarySearch( const T &v ) + { + int l, r, m ; + l = 0 ; + r = size - 1 ; + + while ( l <= r ) + { + m = ( l + r ) / 2 ; + if ( s[m] == v ) + return m ; + else if ( s[m] < v ) + l = m + 1 ; + else + r = m - 1 ; + + } + return l - 1 ; // Should be between l - 1 and l + } + + void Destroy() + { + if ( s != NULL ) + free( s ) ; + s = NULL ; + size = capacity = 0 ; + inc = 1 ; + } + + void Overwrite( const SimpleVector &in ) + { + if ( s != NULL ) + free( s ) ; + s = NULL ; + if ( in.s != NULL ) + s = (T *)malloc( sizeof( T ) * in.capacity ) ; + size = in.size ; + capacity = in.capacity ; + inc = in.inc ; + int i ; + for ( i = 0 ; i < size ; ++i ) + s[i] = in.s[i] ; + } + + void Reverse() + { + int i, j ; + T tmp ; + for ( i = 0, j = size - 1 ; i < j ; ++i, --j ) + { + tmp = s[j] ; + s[j] = s[i] ; + s[i] = tmp ; + } + } + + // Expand the array by given size. + // Does not care about the value in the new allocated space. 
+ int ExpandBy( int expandSize ) + { + int newSize = size + expandSize ; + if ( newSize <= capacity ) + { + size = newSize ; + } + else + { + //int tmp = capacity ; + capacity = newSize + inc ; + inc *= 2 ; + if ( maxInc > 0 && inc > maxInc ) + inc = maxInc ; + if ( size == 0 ) + s = (T *)malloc( sizeof( T ) * capacity ) ; + else + s = (T *)realloc( s, sizeof( T ) * capacity ) ; + if ( s == NULL ) + { + fprintf( stderr, "%s: Failed to allocate memory.\n", __func__ ) ; + exit( 1 ) ; + } + size = newSize ; + } + return size ; + } + + int ExpandTo( int newSize ) + { + return ExpandBy( newSize - size ) ; + } + + void ShiftRight( int shift ) + { + size = ExpandBy( shift ) ; + int i ; + + for ( i = size - 1 ; i >= shift ; --i ) + s[i] = s[i - shift] ; + return ; + } + + // Set the content to zero in the range + void SetZero( int start, int len ) + { + memset( s + start, 0, sizeof( T ) * len ) ; + } + + T *BeginAddress() + { + return s ; + } + T *EndAddress() + { + return s + size ; + } +} ; + +#endif diff --git a/compactds/SuffixArrayGenerator.hpp b/compactds/SuffixArrayGenerator.hpp new file mode 100644 index 0000000..8d52a13 --- /dev/null +++ b/compactds/SuffixArrayGenerator.hpp @@ -0,0 +1,725 @@ +#ifndef _MOURISL_COMPACTDS_SUFFIXARRAY_GENERATOR +#define _MOURISL_COMPACTDS_SUFFIXARRAY_GENERATOR + +#include + +#include "FixedSizeElemArray.hpp" +#include "DifferenceCover.hpp" + +// The class handle the generation of suffix array by chunks +// The chunk creation is based the sampled difference cover (Algorithm 11.9 is commented out) +namespace compactds { +class SuffixArrayGenerator +{ +private: + size_t _n ; + size_t _space ; + size_t _alphabetSize ; + + // The variables relate to generate the boundaries/_cuts + size_t _b ; + size_t* _cuts ; // The index on T + size_t _cutCnt ; + size_t **_cutLCP ; // self LCP for each cut + + // The variables relate to the difference cover and its ISAs + DifferenceCover _dc ; + size_t *_dcISA ; // The difference cover's index should be 
compacted when query this ISA + size_t _dcSize ; + + // Relate to cut ============================================ +#if 0 // The commented out codes is for Algorithm 11.9, which might be too slow for very repetitive sequence (i.e: ACGTACGTACGT....), so we have another implementation now + + // The functions relate to generate cut + int SuffixCompareCutString(const FixedSizeElemArray &T, size_t n, size_t i, const FixedSizeElemArray &s, size_t k) + { + if (k == 0) + return 0 ; + else + return T.SubrangeCompare(i, i + k - 1, s, 0, k - 1) ; + } + + // Count the size for each alphabet following current cut string. + // s: the cut string + // k: length of the cut string + void CountCutExtension(const FixedSizeElemArray &T, size_t n, const FixedSizeElemArray &s, size_t k, size_t *alphabetCounts) + { + size_t i ; + memset(alphabetCounts, 0, sizeof(alphabetCounts[0]) * _alphabetSize) ; + for (i = 0 ; i < n - k ; i += downsample) + { + if (!SuffixCompareCutString(T, n, i, s, k)) + alphabetCounts[ T.Read(i + k) ] += downsample ; + } + } + + // s: the cut string + // k: length of the cut string + void ExpandInCut(const FixedSizeElemArray &T, size_t n, FixedSizeElemArray &s, size_t k, size_t *chunkLens) + { + size_t* alphabetCounts = (size_t *)malloc(sizeof(size_t) * _alphabetSize) ; + CountCutExtension(T, n, s, k, alphabetCounts) ; + size_t c ; + for (c = 0 ; c < _alphabetSize ; ++c) + { + s.Write(k, c) ; + if (alphabetCounts[c] <= b) + { + if (chunkLens[_cutCnt - 1] + alphabetCounts[c] > b) + { + _cuts[_cutCnt].InitFromOtherPrefix(s, k + 1) ; + ++_cutCnt ; + chunkLens[_cutCnt - 1] = 0 ; + } + chunkLens[_cutCnt - 1] += alphabetCounts[c] ; + } + else + ExpandInCut(T, n, s, k + 1, chunkLens) ; + } + free(alphabetCounts) ; + } + + size_t GenerateCuts(const FixedSizeElemArray &T, size_t n) + { + size_t m = DIV_CEIL(2 * n, b) ; + _cuts = new FixedSizeElemArray[m + 1] ; + size_t* chunkLens = (size_t *)malloc(sizeof(size_t) * (m + 1)) ; + FixedSizeElemArray s ; + 
s.Malloc(T.GetElemLength(), _n < _b ? _n : b) ; + + _cuts[0].Malloc(T.GetElemLength(), 0) ; + chunkLens[0] = 0 ; + _cutCnt = 1 ; + ExpandInCut(T, n, s, 0, chunkLens) ; + _cuts[_cutCnt].Malloc(T.GetElemLength(), 0) ; + printf("%d\n", _cuts[1].GetSize()) ; + free(chunkLens) ; + return _cutCnt ; + } +#endif + + // TODO: handle the case where we don't generate difference cover + size_t GenerateCuts(size_t *_dcSA) + { + size_t i ; + size_t blockCnt = DIV_CEIL(_n, _b) ; + size_t stride = DIV_CEIL(_dcSize, blockCnt) ; + blockCnt = DIV_CEIL(_dcSize, stride) ; + _cutCnt = blockCnt ; + _cuts = (size_t *)malloc(sizeof(size_t) * (_cutCnt + 1)) ; + _cuts[0] = _n ; + for (i = stride ; i < _dcSize ; i += stride) + { + //printf("_cuts %d = %d\n", i / stride, _dcSA[i]) ; + _cuts[i / stride] = _dcSA[i] ; + } + _cuts[blockCnt] = _n ; + return _cutCnt ; + } + + // For each cut s, compute LCP(s, s[i:]) for i <= maxSize + void ComputeCutLCP(const FixedSizeElemArray &T, size_t n, size_t maxSize) + { + size_t i, j, l ; + _cutLCP = (size_t **)malloc(sizeof(*_cutLCP) * _cutCnt) ; + for (i = 0 ; i < _cutCnt ; ++i) + { + size_t jopenend = n - _cuts[i] ; + if (jopenend > maxSize ) + jopenend = maxSize ; + _cutLCP[i] = (size_t *)malloc(sizeof(_cutLCP[i][0]) * jopenend) ; + if (jopenend == 0) + continue ; + _cutLCP[i][0] = jopenend ; + for (j = 1 ; j < jopenend ; ++j) + { + for (l = 0 ; l < jopenend - j ; ++l) + { + if (T.Read(_cuts[i] + l) != T.Read(_cuts[i] + j + l)) + break ; + } + _cutLCP[i][j] = l ; + } + } + } + + // Compare the T[i,...] with a cut ci, and adjust other auxiliary data relating + // rightmosti: the start position corresponding to the rightmost j + // rightj: the rightmost end position + // The auxiliary data is for reusing some information from the + // preiouv T[i-1,...] comparison + // return: sign represent T[i:n]-cut . 
+ int CompareCutUsingCutLCP(const FixedSizeElemArray &T, size_t n, size_t i, size_t ci, + size_t &rightmosti, size_t &rightmostj) + { + size_t j ; // position on T + size_t cpos ; // position on cut + size_t overlap = 0 ; + size_t cutLen = n - _cuts[ci] ; + if (i == _cuts[ci]) + return 0 ; // return 0 for equal + //if (i == 15 && ci == 6) + // printf("> %d %d %d\n", i, rightmosti, rightmostj) ; + + if (cutLen > (size_t)_dc.GetV()) + cutLen = _dc.GetV() ; + if (rightmostj > 0 && i <= rightmostj) // i<=rightmostj? + { + overlap = _cutLCP[ci][i - rightmosti] ; + if (rightmostj <= i + overlap - 1) // we may need to update the range + { + /*for (j = rightmostj + 1, cpos = rightmostj - i + 1 ; + j < n && cpos < cutLen; ++j, ++cpos) + { + if (T.Read(j) != T.Read(_cuts[ci] + cpos)) + break ; + } + --j ; --cpos ; + rightmostj = j ; + rightmosti = i ;*/ + size_t localMatchCnt = T.PrefixMatchLen(rightmostj + 1, n - 1, + T, _cuts[ci] + rightmostj - i + 1, _cuts[ci] + cutLen - 1) ; + rightmosti = i ; + j = (rightmostj + 1) + localMatchCnt - 1 ; + rightmostj = j ; + cpos = j - i ; + } + else + { + return (int)T.Read(i + overlap) - T.Read(_cuts[ci] + overlap) ; + } + } + else + { + /*for (j = i, cpos = 0 ; j < n && cpos < cutLen ; ++j, ++cpos) + if (T.Read(j) != T.Read(_cuts[ci] + cpos)) + break ; + //printf("%d %d %d. 
%d\n", i, ci, _cuts[ci], cpos) ; + if (cpos == 0) + return T.Read(j) - T.Read(_cuts[ci] + cpos) ; + + --j; --cpos ; + rightmostj = j ; + rightmosti = i ;*/ + + size_t localMatchCnt = T.PrefixMatchLen(i, n - 1, + T, _cuts[ci], _cuts[ci] + cutLen - 1) ; + if (localMatchCnt == 0) + return T.Read(i) - T.Read(_cuts[ci]) ; + j = i + localMatchCnt - 1 ; + rightmostj = j ; + cpos = localMatchCnt - 1 ; + rightmosti = i ; + } + + if (j == n - 1 && cpos == n - _cuts[ci] - 1) + return 0 ; + else if (j == n - 1) + return -1 ; + else if (cpos == n - _cuts[ci] - 1) + return 1 ; + else + { + int cmp = T.Read(j + 1) - T.Read(_cuts[ci] + cpos + 1) ; + if (cmp == 0) + return CompareSuffixWithDC(i, _cuts[ci], n) ; + else + return cmp ; + } + } + + // Relate to suffix sorting ============================================ + void Swap(size_t &a, size_t &b) + { + size_t tmp ; + tmp = a ; a = b ; b = tmp ; + } + + // Compare T[a:] and T[b:] directly with DC, which assumes their first + // v prefix are matched + // @return: sign(T[a:]-T[b:]) + int CompareSuffixWithDC(size_t a, size_t b, size_t n) + { + if (a == b) + return 0 ; + int delta = _dc.Delta(a, b) ; + int compare = 0 ; + if (a + delta >= n) + compare = 1 ; + else if (b + delta >= n) + compare = -1 ; + else + { + size_t aisa = _dcISA[ _dc.CompactIndex(a + delta) ] ; + size_t bisa = _dcISA[ _dc.CompactIndex(b + delta) ] ; + if (aisa < bisa) + compare = -1 ; + else + compare = 1 ; + } + return compare ; + } + + // Use difference cover to quick sort the SA. 
We don't need to pass T now + void QSortWithDC(size_t *sa, size_t m, size_t s, size_t e, size_t n) + { + if (s >= e) + return ; + // Partition + Swap(sa[s], sa[(s + e)/2]) ; + size_t pivot = sa[s] ; // pivot is the median element + size_t pi, pj ; // partiation indexes + pi = s + 1; // pi points to the current process element + pj = e + 1 ; // pj points the first element of the second chunk + while (pi < pj) + { + int comparePivot = CompareSuffixWithDC(sa[pi], pivot, n) ; + + if (comparePivot < 0) + ++pi ; + else + { + Swap(sa[pi], sa[pj - 1]) ; + --pj ; + } + } + Swap(sa[s], sa[pi - 1]) ; + if (pi > 2) + QSortWithDC(sa, m, s, pi - 2, n) ; + QSortWithDC(sa, m, pi, e, n) ; + } + + // Sort T[s..e], and only consider the positions in sa of size m + // s, e: the range for sa + // d: the preifx already matched in T[s..e], kind of as depth. + // dcStrategy: how to use the difference cover. 0-no _dc, 1-use _dc, 2-return when reach _dcv + void MultikeyQSort(const FixedSizeElemArray &T, size_t n, size_t *sa, size_t m, size_t s, size_t e, size_t d, int dcStrategy, size_t *alphabetCounts) + { + if (s >= e) + return ; + if (dcStrategy != 0 && d >= (size_t)_dc.GetV()) + { + if (dcStrategy == 2) + return ; + else if (dcStrategy == 1) + { + // We now use compare the suffix using difference cover + QSortWithDC(sa, m, s, e, n) ; + return ; + } + } + + size_t i ; + size_t tmp ; + // Find pivot + size_t pivot = 0 ; + + // quick check whether every suffix is the same using blocks + const int alphabetBits = T.GetElemLength() ; + const int block = WORDBITS / alphabetBits ; + while (1) + { + if (dcStrategy != 0 && d >= (size_t)_dc.GetV()) + break ; + bool passEnd = false ; // any suffix pass the end of the T + WORD foundw = 0 ; + if (sa[s] + d + block - 1 < n) + foundw = T.PackRead(sa[s] + d, block) ; + else + passEnd = true ; + for (i = s + 1 ; i <= e ; ++i) + { + if (sa[i] + d + block - 1 < n) + { + WORD w = T.PackRead(sa[i] + d, block) ; + if (w != foundw) + break ; + } + else + { + 
if (passEnd == false) + break ; + } + } + if (i > e) + d += block ; + else + break ; + } + + // Real search + while (1) + { + if (dcStrategy != 0 && d >= (size_t)_dc.GetV()) + break ; + + memset(alphabetCounts, 0, sizeof(alphabetCounts[0]) * (_alphabetSize + 1)) ; + for (i = s ; i <= e ; ++i) + { + if (sa[i] + d < n) + ++alphabetCounts[T.Read(sa[i] + d) + 1] ; + else + ++alphabetCounts[0] ; + } + if (alphabetCounts[0] == e - s + 1) + return ; + tmp = 0 ; + for (i = 0 ; i <= _alphabetSize ; ++i) + { + if (alphabetCounts[i] > 0) + ++tmp ; + } + if (tmp == 1) // the next character is the same for all the suffixes in the range + { + ++d ; + continue ; + } + + tmp = 0 ; + for (i = 0 ; i <= _alphabetSize ; ++i) + { + tmp += alphabetCounts[i] ; + if (tmp >= (e - s + 1) / 2) + break ; + } + pivot = i ; // pivot character + if (pivot > 0) + --pivot ; + break ; + } + + // Partition + size_t pi, pj, pk ; // partiation indexes + pi = s ; // pi points to the first element of the middle chunk + pj = s ; // pj is the current element + pk = e ; // pk points the last element of the middle chunk + while (pj <= pk) + { + int comparePivot = 0 ; + if (sa[pj] + d >= n) + comparePivot = -1 ; + else + { + size_t c = T.Read(sa[pj] + d) ; + if (c < pivot) + comparePivot = -1 ; + else if (c == pivot) + comparePivot = 0 ; + else + comparePivot = 1 ; + } + + if (comparePivot == -1) + { + Swap(sa[pi], sa[pj]) ; + ++pi ; ++pj ; + } + else if (comparePivot == 1) + { + Swap(sa[pj], sa[pk]) ; + if (pk == 0) + break ; + --pk ; + } + else + ++pj ; + } + + // Recursive sorting + if (pi >= 1) + MultikeyQSort(T, n, sa, m, s, pi - 1, d, dcStrategy, alphabetCounts) ; + MultikeyQSort(T, n, sa, m, pi, pk, d + 1, dcStrategy, alphabetCounts) ; + MultikeyQSort(T, n, sa, m, pj, e, d, dcStrategy, alphabetCounts) ; + } + + // Sort the suffixes in the difference cover + // @return: the suffix array of the difference cover + size_t *SortSuffixInDC(const FixedSizeElemArray &T, size_t n) + { + size_t i, k ; + size_t 
*sa ; + size_t *rank ; // rank of a suffix consider the prefix of size k, allowing ties + size_t *nextBuffer ; // buffer for next iteration (double expanded) information + size_t *count ; // cumulative rank count + size_t maxRank ; // distinct ranks + size_t dci ; // compacted difference cover index + int v = _dc.GetV() ; + + sa = (size_t *)malloc(sizeof(size_t) * _dcSize) ; + rank = (size_t *)malloc(sizeof(size_t) * _dcSize) ; + nextBuffer = (size_t *)malloc(sizeof(size_t) * _dcSize) ; + count = (size_t *)malloc(sizeof(size_t) * _dcSize) ; + + // Sort by their first v characters + _dc.GetDiffCoverList(n, sa) ; + size_t *alphabetCounts = (size_t *)malloc(sizeof(size_t) * (_alphabetSize + 1)) ; + MultikeyQSort(T, n, sa, _dcSize, 0, _dcSize - 1, 0, /*dcStrategy=*/2, alphabetCounts) ; + free(alphabetCounts) ; + + maxRank = 0 ; + count[0] = 0 ; + for (i = 0 ; i < _dcSize ; ++i) + { + if (i > 0 && T.SubrangeCompare( sa[i - 1], sa[i - 1] + v - 1, T, sa[i], sa[i] + v - 1)) + { + ++maxRank ; + count[maxRank] = 0 ; + } + dci = _dc.CompactIndex( sa[i] ) ; + rank[dci] = maxRank ; + ++count[maxRank] ; + } + for (i = 1 ; i <= maxRank ; ++i) + count[i] += count[i - 1] ; + + // Sorting difference cover using Manber-Myers algorithm + size_t *tmpSwap ; + for (k = v ; k < n /*&& maxRank < _dcSize - 1*/ ; k <<= 1) + { + // Get the new SA, nextBuffer serves as the next SA + size_t ri ; // reverse i + for (ri = 0 ; ri < _dcSize; ++ri) + { + i = _dcSize - 1 - ri ; + + dci = _dc.CompactIndex( sa[i] ) ; + if (sa[i] >= n - k) + nextBuffer[i] = sa[i] ; + if (sa[i] < k) + continue ; + size_t kbeforeDci = _dc.CompactIndex(sa[i] - k) ; + nextBuffer[ count[ rank[kbeforeDci] ] - 1 ] = sa[i] - k ; + --count[ rank[kbeforeDci] ] ; + } + //memcpy(sa, nextBuffer, sizeof(sa[0]) * _dcSize) ; + tmpSwap = sa ; + sa = nextBuffer ; + nextBuffer = tmpSwap ; + + // Update the rank, nextBuffer serves as the next rank. 
+ maxRank = 0 ; + count[0] = 0 ; + size_t prevDci = 0 ; + for (i = 0 ; i < _dcSize ; ++i) + { + dci = _dc.CompactIndex(sa[i]) ; + if (i > 0 && + (rank[dci] != rank[prevDci] || + sa[i - 1] + k >= n || sa[i] + k >= n || + rank[ _dc.CompactIndex(sa[i - 1] + k)] != rank[_dc.CompactIndex(sa[i] + k)])) + { + ++maxRank ; + count[maxRank] = 0 ; + } + nextBuffer[dci] = maxRank ; + ++count[maxRank] ; + prevDci = dci ; + } + tmpSwap = rank ; + rank = nextBuffer ; + nextBuffer = tmpSwap ; + + if (maxRank >= _dcSize - 1) + break ; + + for (i = 1 ; i <= maxRank ; ++i) + count[i] += count[i - 1] ; + } + + free(nextBuffer) ; + free(count) ; + + _dcISA = rank ; + return sa ; + } + +public: + SuffixArrayGenerator() + { + _b = 1<<24 ; // 2^24, 16MB block size by default + _n = _space = 0 ; + _cuts = NULL ; + _dcISA = NULL ; + } + + ~SuffixArrayGenerator() + { + Free() ; + } + + void Free() + { + _space = 0 ; + + if (_cuts != NULL) + { + free(_cuts) ; + if (_cutLCP != NULL) + { + size_t i ; + for (i = 0 ; i < _cutCnt ; ++i) + free(_cutLCP[i]) ; + free(_cutLCP) ; + } + } + if (_dcISA != NULL) + free(_dcISA) ; + } + + size_t GetSpace() + { + return _space + sizeof(*this) ; + } + + // Initialize the generator to obtain the _cuts + // _dcv: difference cover period + // @return: the number of _cuts + size_t Init(const FixedSizeElemArray &T, size_t n, size_t b, int dcv, int alphabetSize) + { + this->_n = n ; + if (b > 0) + this->_b = b ; + this->_alphabetSize = alphabetSize ; + _dc.Init(dcv) ; + _dcSize = _dc.GetSize(n) ; + size_t *dcSA = SortSuffixInDC(T, n) ; + + GenerateCuts(dcSA) ; + ComputeCutLCP(T, n, dcv) ; + free(dcSA) ; + return _cutCnt ; + } + + size_t GetChunkCount() + { + return _cutCnt ; + } + + // Generate the from-th chunk to to-th chunk for T[s..e], both are inclusive + // Each chunk is left close, right open for the cut. + // The procedure utilized _cutLCP to expediate the search. 
+ void GetChunksPositions(const FixedSizeElemArray &T, size_t n, size_t from, size_t to, size_t s, size_t e, std::vector< std::vector > &pos) + { + size_t i, j ; + if (to >= _cutCnt) + to = _cutCnt - 1 ; + if (e >= n) + e = n - 1 ; + std::vector< std::vector >().swap(pos) ; + for (j = from ; j <= to ; ++j) + pos.push_back( std::vector() ) ; + + size_t *rightmosti = (size_t *)calloc(to - from + 2, sizeof(size_t)) ; + size_t *rightmostj = (size_t *)calloc(to - from + 2, sizeof(size_t)) ; + for (i = s ; i <= e ; ++i) + { + if ((from == 0 || CompareCutUsingCutLCP(T, n, i, from, rightmosti[from - from], rightmostj[from - from]) >= 0) + && (to == _cutCnt - 1 + || CompareCutUsingCutLCP(T, n, i, to + 1, rightmosti[to + 1 - from], rightmostj[to + 1 - from]) < 0)) + { + for (j = from + 1 ; j <= to ; ++j) + { + if (CompareCutUsingCutLCP(T, n, i, j, rightmosti[j-from], rightmostj[j-from]) < 0) + break ; + } + pos[(j-1) - from].push_back(i) ; + } + } + free(rightmosti) ; + free(rightmostj) ; + } + + void SortSuffixByPos(const FixedSizeElemArray &T, size_t n, size_t *pos, size_t m, size_t *sa) + { + if (m == 0) + return ; + size_t i ; + if (sa != pos) + for (i = 0 ; i < m ; ++i) + sa[i] = pos[i] ; + size_t *alphabetCounts = (size_t *)malloc(sizeof(size_t) * (_alphabetSize + 1)) ; + MultikeyQSort(T, n, sa, m, 0, m - 1, 0, /*dcStrategy=*/1, alphabetCounts) ; + free(alphabetCounts) ; + } + + // Functions relating to use disk to hold chunks + // Output each chunk to prefix_{xxx}.chunk file + void OutputChunksToFiles(char *prefix) + { + int i ; + char filename[1024] ; + FILE **fps ; + fps = (FILE **)malloc(sizeof(FILE*) * _cutCnt) ; + for (i = 0 ; i < (int)_cutCnt ; ++i) + { + sprintf(filename, "%s_%d.chunk", prefix, i) ; + fps[i] = fopen(filename, "w") ; + } + + for (i = 0 ; i < (int)_cutCnt ; ++i) + fclose(fps[i]) ; + free(fps) ; + } + + // Read in the i-th chunk file + void ReadChunkFile(char *prefix, int i, std::vector &pos) + { + char filename[1024] ; + sprintf(filename, 
"%s_%d.chunk", prefix, i) ; + FILE *fp = fopen(filename, "r") ; + + fclose(fp) ; + } + + // Remove all the temporary chunk files + void CleanChunkFiles(char *prefix) + { + char filename[1024] ; + for (int i = 0 ; i < (int)_cutCnt ; ++i) + { + sprintf(filename, "%s_%d.chunk", prefix, i) ; + } + } + + // Use the Theorem 2 from "Fast Lightweight Suffix Array Construction and Checking" + // The simpler implementation requiring creating ISA, so it is memory intensive + bool ValidateSA(const FixedSizeElemArray &T, size_t n, size_t *sa) + { + size_t i ; + size_t *isa = (size_t *)malloc(sizeof(size_t) * n) ; + for (i = 0 ; i < n ; ++i) + { + if (sa[i] >= n) + return false ; + if (i > 0) + { + if (sa[i - 1] == sa[i]) + return false ; + if (T.Read(sa[i - 1]) > T.Read(sa[i])) + return false ; + } + } + for (i = 0 ; i < n ; ++i) + isa[sa[i]] = i ; + + for (i = 1 ; i < n ; ++i) + { + if (T.Read(sa[i - 1]) == T.Read(sa[i])) + { + if (sa[i-1] + 1 < n && sa[i] + 1 < n) + { + if (isa[ sa[i - 1] + 1] > isa[ sa[i] + 1]) + return false ; + } + else if (sa[i] + 1 == n) // only the previous can be followed by the end of the string + return false ; + } + } + + free(isa) ; + return true ; + } +} ; +} + +#endif diff --git a/compactds/Tree.hpp b/compactds/Tree.hpp new file mode 100644 index 0000000..34063ab --- /dev/null +++ b/compactds/Tree.hpp @@ -0,0 +1,167 @@ +#ifndef _MOURISL_COMPACTDS_TREE +#define _MOURISL_COMPACTDS_TREE + +#include "Utils.hpp" + +namespace compactds { + +class Tree +{ +protected: + size_t _space ; + size_t _n ; // number of nodes in tree + +public: + Tree() + { + _space = 0 ; + _n = 0 ; + } + ~Tree() {} + + virtual size_t GetSpace(bool inclusive) = 0 ; + + virtual size_t Root() const = 0 ; + virtual size_t ChildSelect(size_t v, size_t t) const = 0 ; // Get the t-th (1-based) child of v + virtual size_t FirstChild(size_t v) const = 0 ; + virtual size_t LastChild(size_t v) const = 0 ; + virtual size_t ChildrenCount(size_t v) const = 0 ; + virtual size_t 
ChildRank(size_t v) const = 0 ; // Rank is always 1-based + + virtual size_t NextSibling(size_t v) const = 0 ; + virtual size_t PrevSibling(size_t v) const = 0 ; + + virtual size_t Parent(size_t v) const = 0 ; + virtual bool IsLeaf(size_t v) const = 0 ; + + virtual size_t NodeMap(size_t v) const = 0 ; + virtual size_t NodeSelect(size_t i) const = 0 ; + + // Whether u is an ancestor of v. + virtual bool IsAncestor(size_t u, size_t v) const + { + size_t p = v ; + while (p != Root() && p != u) + p = Parent(p) ; + if (p == u) + return true ; + else + return false ; + } + + virtual size_t Depth(size_t v) const + { + if (v == Root()) + return 0 ; + size_t ret = 1 ; + size_t p = Parent(v) ; + while (p != Root()) + { + p = Parent(p) ; + ++ret ; + } + return ret ; + } + + virtual size_t LeafCountInSubTree(size_t v) const + { + if (IsLeaf(v)) + return 1 ; + size_t i ; + size_t childCnt = ChildrenCount(v) ; + size_t ret = 0 ; + for (i = 0 ; i < childCnt ; ++i) + ret += LeafCountInSubTree( ChildSelect(v, i + 1) ) ; + return ret ; + } + + virtual size_t SubTreeSize(size_t v) const + { + if (IsLeaf(v)) + return 1 ; + size_t i ; + size_t childCnt = ChildrenCount(v) ; + size_t ret = 0 ; + for (i = 0 ; i < childCnt ; ++i) + ret += SubTreeSize( ChildSelect(v, i + 1) ) ; + return ret + 1 ; + } + + virtual bool IsFirstChild(size_t v) const + { + if (v == Root()) + return true ; + if (ChildRank(v) == 1) + return true ; + return false ; + } + + virtual bool IsLastChild(size_t v) const + { + if (v == Root()) + return true ; + + size_t p = Parent(v) ; + size_t pChildCnt = ChildrenCount(p) ; + if (ChildRank(v) == pChildCnt) + return true ; + return false ; + } + + virtual size_t LCA(size_t u, size_t v) const + { + SimpleVector upath ; + SimpleVector vpath ; + + size_t p ; + + upath.PushBack(u) ; + p = Parent(u) ; + while (p != 0) + { + upath.PushBack(p) ; + p = Parent(p) ; + } + upath.PushBack(0) ; + + vpath.PushBack(v) ; + p = Parent(v) ; + while (p != 0) + { + vpath.PushBack(p) ; + p = 
Parent(p) ; + } + vpath.PushBack(0) ; + + upath.Reverse() ; + vpath.Reverse() ; + + size_t size = MIN(upath.Size(), vpath.Size()) ; + size_t i ; + for (i = 0 ; i < size; ++i) + if (upath[i] != vpath[i]) + break ; + return upath[i - 1] ; + } + + size_t GetSize() const + { + return _n ; + } + + virtual void Save(FILE *fp) + { + SAVE_VAR(fp, _space) ; + SAVE_VAR(fp, _n) ; + } + + virtual void Load(FILE *fp) + { + LOAD_VAR(fp, _space) ; + LOAD_VAR(fp, _n) ; + } +} ; + +} // end of namespace + +#endif diff --git a/compactds/Tree_BP.hpp b/compactds/Tree_BP.hpp new file mode 100644 index 0000000..67e9e59 --- /dev/null +++ b/compactds/Tree_BP.hpp @@ -0,0 +1,316 @@ +#ifndef _MOURISL_COMPACTDS_TREE_BP +#define _MOURISL_COMPACTDS_TREE_BP + +// Represent a tree by balanced parenthesis (Chapter 8.2) +// Each v points to a parenthesis like (...) containing the substree +// The implementation details might be different as we are using 0-index +// tree node id (i) as well. +// In the implementation, v is for the index on the encoded bitvector B. + +#include "Tree.hpp" +#include "Tree_Plain.hpp" +#include "Tree_Cardinal_Plain.hpp" +#include "Bitvector_Plain.hpp" +#include "DS_Parenthesis.hpp" + +namespace compactds { +class Tree_BP: public Tree +{ +private: + Bitvector_Plain _B ; // bits representation of the parenthesis + DS_Parenthesis _bp ; // dangling structure + + // DFS to mark the parenthesis as B + // tag: tree node id. 
bi: index on B + void Build(const struct _plainTreeNode *treeNodes, size_t n, size_t tag, + size_t *treeIdMap, size_t &visited, size_t &bi) + { + size_t c ; + _B.BitSet(bi) ; + + treeIdMap[tag] = visited ; + ++visited ; + + ++bi ; + for (c = treeNodes[tag].child ; c != 0 ; c = treeNodes[c].sibling) + Build(treeNodes, n, c, treeIdMap, visited, bi) ; + //_B.BitClear(bi) ; // close the parentehsis + ++bi ; + } + + void BuildFromCardinalTree(const struct _plainCardinalTreeNode *treeNodes, size_t n, size_t childCnt, size_t tag, size_t *treeIdMap, size_t &visited, size_t &bi) + { + size_t i ; + _B.BitSet(bi) ; + + treeIdMap[tag] = visited ; + ++visited ; + + ++bi ; + for (i = 0 ; i < childCnt ; ++i) + { + size_t c = treeNodes[tag].children[i] ; + if (c == 0) + continue ; + BuildFromCardinalTree(treeNodes, n, childCnt, c, treeIdMap, visited, bi) ; + } + //_B.BitClear(bi) ; // close the parentehsis + ++bi ; + } +public: + Tree_BP() {} + ~Tree_BP() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + _B.Free() ; + _bp.Free() ; + _n = 0 ; + } + } + + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + void Init(const struct _plainTreeNode *treeNodes, size_t n, size_t *treeIdMap) + { + _n = n ; + _B.Malloc(2 * n) ; + + size_t bi = 0 ; + size_t visited = 0 ; + Build(treeNodes, n, 0, treeIdMap, visited, bi) ; + + _B.Init() ; + + // the last 2,2 is for pattern 10 as "()" + // Note that due to our reading the bits from low to high, "1" will be the first bit + // In other word, order is reversed. 
+ _bp.Init(_B.GetData(), 2 * _n, 1, 2) ; + + _space = _B.GetSpace() - sizeof(_B) + _bp.GetSpace(false) ; + } + + void InitFromCardinalTree(const struct _plainCardinalTreeNode *treeNodes, size_t n, size_t childCount, size_t *treeIdMap) + { + _n = n ; + _B.Malloc(2 * n) ; + + size_t bi = 0 ; + size_t visited = 0 ; + BuildFromCardinalTree(treeNodes, n, childCount, 0, treeIdMap, visited, bi) ; + + _B.Init() ; + + // the last 2,2 is for pattern 10 as "()" + // Note that due to our reading the bits from low to high, "1" will be the first bit + // In other word, order is reversed. + _bp.Init(_B.GetData(), 2 * _n, 1, 2) ; + + _space = _B.GetSpace() - sizeof(_B) + _bp.GetSpace(false) ; + } + // The index in B + size_t Root() const + { + return 0 ; + } + + // @return: the t-th child (1-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + return _bp.Open(_bp.GetRmmTree().MinSelect(v + 1, _bp.Close(v, _B.GetData(), 2 * _n) - 1, + t, _B.GetData(), 2 * _n), _B.GetData(), 2*_n) ; + } + + size_t FirstChild(size_t v) const + { + return v + 1 ; + } + + // Parenthesis like + // (...(...)) + // | | + // v lastchild + size_t LastChild(size_t v) const + { + return _bp.Open(_bp.Close(v, _B.GetData(), 2*_n)-1, _B.GetData(), 2*_n) ; + } + + size_t ChildrenCount(size_t v) const + { + if (IsLeaf(v)) + return 0 ; + // Each child's (...) has excess 0 after the end. + return _bp.GetRmmTree().MinCount(v + 1, _bp.Close(v, _B.GetData(), 2*_n) - 1, + _B.GetData(), 2*_n) ; + } + + // return: v is the ret-th (1-based) child of the parent. + size_t ChildRank(size_t v) const + { + if (v == Root()) + return 0 ; + size_t p = Parent(v) ; + if (p + 1 == v) + return 1 ; + return _bp.GetRmmTree().MinCount(p + 1, v - 1, _B.GetData(), 2*_n) + 1 ; + } + + // The silbing function assumes v has + // those siblings. 
+ size_t NextSibling(size_t v) const + { + return _bp.Close(v, _B.GetData(), 2*_n) + 1 ; + } + + size_t PrevSibling(size_t v) const + { + return _bp.Open(v - 1, _B.GetData(), 2*_n) ; + } + + size_t Parent(size_t v) const + { + if (v == Root()) + return 0 ; + return _bp.Enclose(v, _B.GetData(), 2*_n) ; + } + + bool IsLeaf(size_t v) const + { + if (_B.Access(v + 1) == 0) + return true ; + return false ; + } + + size_t LCA(size_t u, size_t v) const + { + if (u > v) + { + size_t tmp = u ; + u = v ; + v = tmp ; + } + + //printf("%d %d: %d %d\n", u, v, _bp.GetRmmTree().Rmq(u, v, _B.GetData(), 2*_n) + 1, + // _bp.Enclose(_bp.GetRmmTree().Rmq(u, v, _B.GetData(), 2*_n) + 1, _B.GetData(), 2*_n)) ; + return _bp.Enclose(_bp.GetRmmTree().Rmq(u, v, _B.GetData(), 2*_n) + 1, _B.GetData(), 2*_n) ; + } + + // Maps index in B (v) back up to the actual node id + // Pre-order + size_t NodeMap(size_t v) const + { + return _B.Rank(1, v, /*inclusive=*/0) ; + } + + //Map actual node id to index in B (v). + // Pre-order Select + size_t NodeSelect(size_t i) const + { + return _B.Select(1, i + 1) ; + } + + size_t PostOrder(size_t v) const + { + return _B.Rank(0, _bp.Close(v, _B.GetData(), 2 * _n), /*inclusive*/0) ; + } + + size_t PostOrderSelect(size_t i) const + { + return _bp.Open(_B.Select(0, i + 1), _B.GetData(), 2 * _n) ; + } + + // Root has depth 0 + size_t Depth(size_t v) const + { + // Kind of excess + // inclusive=0 means rank-1 here + return 2 * _B.Rank(1, v, /*inclusive=*/0) - v ; + } + + // # of nodes in the substree, inclusive. + size_t SubTreeSize(size_t v) const + { + return (_bp.Close(v, _B.GetData(), 2 * _n) - v + 1) / 2 ; + } + + // Whether u is an ancestor of v. 
+ bool IsAncestor(size_t u, size_t v) const + { + size_t uclose = _bp.Close(u, _B.GetData(), 2 * _n) ; + if (u <= v && v <= uclose) + return true ; + return false ; + } + + // The ancestor at d levels above + size_t LevelAncestor(size_t v, int64_t d) const + { + return _bp.GetRmmTree().BwdSearch(v, -d, _B.GetData(), 2 * _n) ; + } + + // Would be v it self if v is the leaf. + size_t DeepestNode(size_t v) const + { + return _bp.GetRmmTree().RMq(v, _bp.Close(v, _B.GetData(), 2 * _n), _B.GetData(), 2 * _n) ; + } + + // The distance from v to the deepest leaf + // 0 if v is the leaf + size_t Height(size_t v) const + { + size_t depthv = Depth(v) ; + size_t depthc = Depth( DeepestNode(v) ) ; + return depthc - depthv ; + } + + // Number of leaves in the subtree of v + size_t LeafCountInSubTree(size_t v) const + { + if (IsLeaf(v)) + return 1 ; + else + { + // Since close(v) is ")", the LeafRank(close(v)) is automatically exclusive + return LeafRank( _bp.Close(v, _B.GetData(), 2 *_n)) - LeafRank(v) ; + } + } + + // Rank and select with respect to leaves in _B order + size_t LeafRank(size_t v, int inclusive = 1) const + { + return _bp.PatternRank(v, _B.GetData(), 2*_n, inclusive) ; + } + + size_t LeafSelect(size_t i) const + { + return _bp.PatternSelect(i, _B.GetData(), 2*_n) ; + } + + void Save(FILE *fp) + { + Tree::Save(fp) ; + _B.Save(fp) ; + _bp.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + Tree::Load(fp) ; + + _B.Load(fp) ; + _bp.Load(fp) ; + } +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_Cardinal.hpp b/compactds/Tree_Cardinal.hpp new file mode 100644 index 0000000..a260351 --- /dev/null +++ b/compactds/Tree_Cardinal.hpp @@ -0,0 +1,43 @@ +#ifndef _MOURISL_COMPACTDS_TREE_CARDINAL +#define _MOURISL_COMPACTDS_TREE_CARDINAL + +#include "Utils.hpp" + +namespace compactds { + +class Tree_Cardinal: public Tree +{ +protected: + size_t _c ; // cardinality (number of max children) +public: + Tree_Cardinal() + { + _space = 0 ; + _n = 0 ; + _c = 0 ; 
+ } + ~Tree_Cardinal() {} + + // Number of children with label l. 1: has such children. 0-don't + virtual size_t ChildrenLabeled(size_t v, size_t l) const = 0 ; + // The child with label l. + virtual size_t LabeledChild(size_t v, size_t l) const = 0 ; + // The label of the edge that leads to node v. + virtual size_t ChildLabel(size_t v) const = 0 ; + + virtual void Save(FILE *fp) + { + Tree::Save(fp) ; + SAVE_VAR(fp, _c) ; + } + + virtual void Load(FILE *fp) + { + Tree::Load(fp) ; + LOAD_VAR(fp, _c) ; + } +} ; + +} // end of namespace + +#endif diff --git a/compactds/Tree_Cardinal_LOUDS.hpp b/compactds/Tree_Cardinal_LOUDS.hpp new file mode 100644 index 0000000..67d0b75 --- /dev/null +++ b/compactds/Tree_Cardinal_LOUDS.hpp @@ -0,0 +1,203 @@ +#ifndef _MOURISL_COMPACTDS_TREE_CARDINAL_LOUDS +#define _MOURISL_COMPACTDS_TREE_CARDINAL_LOUDS + +// Level-order Unary degree sequence for cardinal tree (Chapter 8.1.1, Algorithm 8.4) +// The implementation details might be different as we are using 0-index +// tree node id (i) as well. +// In the implementation, v is for the index on the encoded bitvector B +// to be consistent with the genral LOUDS represent. This is DIFFERENT +// from the textbook. + +#include "Tree_Cardinal.hpp" +#include "Tree_Cardinal_Plain.hpp" +#include "Bitvector_Plain.hpp" + +namespace compactds { +template +class Tree_Cardinal_LOUDS: public Tree_Cardinal +{ +private: + BvClass _B ; +public: + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + // Use c bits per node. 
+ void Init(const struct _plainCardinalTreeNode *treeNodes, size_t n, size_t c, size_t *treeIdMap) + { + size_t i ; + WORD *W = Utils::MallocByBits(c * n) ; + + _n = n ; + _c = c ; + + // BFS on tree nodes + // The algorithm 8.3 will change the original plain tree + size_t *queue = (size_t *)malloc(sizeof(size_t) * n) ; + size_t qhead, qtail ; + queue[0] = 0 ; + qhead = 0 ; + qtail = 1 ; + while (qhead < qtail) + { + size_t node = queue[qhead] ; + if (treeIdMap != NULL) + treeIdMap[node] = qhead ; + + ++qhead ; + + for (i = 0 ; i < _c ; ++i) + { + if (treeNodes[node].children[i] != 0) + { + Utils::BitSet(W, _c * (qhead - 1) + i) ; + queue[qtail] = treeNodes[node].children[i] ; + ++qtail ; + } + } + } + free(queue) ; + + _B.Init(W, c * n) ; + _space = _B.GetSpace() - sizeof(_B); + free(W) ; + } + + // The index in B + size_t Root() const + { + return 0 ; + } + + // @return: the t-th child (0-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + return (_B.Rank(1, v, /*inclusive=*/0) + t) * _c ; + } + + size_t FirstChild(size_t v) const + { + return (_B.Rank(1, v, /*inclusive=*/0) + 1) * _c ; + } + + size_t LastChild(size_t v) const + { + return (_B.Rank(1, v + _c - 1)) * _c ; + } + + size_t ChildrenCount(size_t v) const + { + return _B.Rank(1, v + _c - 1) - _B.Rank(1, v, 0) ; + } + + // return: v is the ret-th (1-based) child of the parent. + size_t ChildRank(size_t v) const + { + if ( v == Root()) + return 0 ; + size_t tid = NodeMap(v) ; + size_t j = _B.Select(1, tid) ; // edge from parent to v + return _B.Rank(1, j) - _B.Rank(1, j/_c * _c, /*inclusive=*/0) ; + } + + // The silbing function assumes v has + // those siblings. 
+ size_t NextSibling(size_t v) const + { + return v + 1 ; + } + + size_t PrevSibling(size_t v) const + { + return v - 1 ; + } + + size_t Parent(size_t v) const + { + if (v == 0) + return Root() ; + else + { + size_t tid = NodeMap(v) ; + // _B.Select(1, tid) identify the edge from the parent to v + // Notice that even though tree node id is 0-based, the edge starts + // to correspnds to node id 1 + return (_B.Select(1, tid) / _c) * _c ; + } + } + + bool IsLeaf(size_t v) const + { + return ChildrenCount(v) == 0 ; + } + + size_t LCA(size_t u, size_t v) const + { + while (u != v) + { + if (u > v) + u = Parent(u) ; + else + v = Parent(v) ; + } + return u ; + } + + // Maps index in B (v) back up to the actual node id + size_t NodeMap(size_t v) const + { + return v / _c ; + } + + //Map actual node id to index in B (v). + size_t NodeSelect(size_t i) const + { + if (i == 0) + return Root() ; + else + return i * _c ; + } + + // Number of children with label l. 1: has such children. 0-don't + size_t ChildrenLabeled(size_t v, size_t l) const + { + return (_B.Access(v + l) == 1) ? 1 : 0 ; + } + + // The childr with label l. + // Assuming label l's child exist + // Notice the difference from Child() + size_t LabeledChild(size_t v, size_t l) const + { + return (_B.Rank(1, v + l, /*inclusive=*/1)) * _c ; + } + + // The label of the edge that leads to node v. 
+ size_t ChildLabel(size_t v) const + { + if (v == Root()) + return 0 ; + size_t tid = NodeMap(v) ; + size_t j = _B.Select(1, tid) ; // edge from parent to v + return j % _c ; + } + + void Save(FILE *fp) + { + Tree_Cardinal::Save(fp) ; + _B.Save(fp) ; + } + + void Load(FILE *fp) + { + Tree_Cardinal::Load(fp) ; + _B.Load(fp) ; + } + +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_Cardinal_Ordinal.hpp b/compactds/Tree_Cardinal_Ordinal.hpp new file mode 100644 index 0000000..f3e5158 --- /dev/null +++ b/compactds/Tree_Cardinal_Ordinal.hpp @@ -0,0 +1,167 @@ +#ifndef _MOURISL_COMPACTDS_TREE_CARDINAL_ORDINAL +#define _MOURISL_COMPACTDS_TREE_CARDINAL_ORDINAL + +// Cardinal tree, where we use ordinal compact tree to store the structure +// and another bit vector to represent the concatenating labels + +#include "Tree_Cardinal.hpp" +#include "Tree_Cardinal_Plain.hpp" +#include "Bitvector_Plain.hpp" + +#include "Tree_BP.hpp" +#include "Tree_DFUDS.hpp" + +namespace compactds { + +template +class Tree_Cardinal_Ordinal: public Tree_Cardinal +{ +private: + TreeClass _t ; + BvClass _B ; // concatenated labeling showing whether the children for this label exist +public: + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + // Use c bits per node. 
+ void Init(const struct _plainCardinalTreeNode *treeNodes, size_t n, size_t c, size_t *treeIdMap) + { + size_t i, j ; + + _n = n ; + _c = c ; + _t.InitFromCardinalTree(treeNodes, n, c, treeIdMap) ; + + + WORD *W = Utils::MallocByBits(c * n) ; + for (i = 0 ; i < n ; ++i) + { + size_t k = treeIdMap[i] ; + for (j = 0 ; j < c ; ++j) + { + if (treeNodes[i].children[j] != 0) + Utils::BitSet(W, _c * k + j) ; + } + } + _B.Init(W, c * n) ; + free(W) ; + + _space = _t.GetSpace(false) + _B.GetSpace() - sizeof(_B) ; + } + + // The index in B + size_t Root() const + { + return _t.Root() ; + } + + // @return: the t-th child (1-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + return _t.ChildSelect(v, t) ; + } + + size_t FirstChild(size_t v) const + { + return _t.FirstChild(v) ; + } + + size_t LastChild(size_t v) const + { + return _t.LastChild(v) ; + } + + size_t ChildrenCount(size_t v) const + { + return _t.ChildrenCount(v) ; + } + + // return: v is the ret-th (1-based) child of the parent. + size_t ChildRank(size_t v) const + { + return _t.ChildRank(v) ; + } + + // The silbing function assumes v has + // those siblings. + size_t NextSibling(size_t v) const + { + return _t.NextSibling(v) ; + } + + size_t PrevSibling(size_t v) const + { + return _t.PrevSibling(v) ; + } + + size_t Parent(size_t v) const + { + return _t.Parent(v) ; + } + + bool IsLeaf(size_t v) const + { + return _t.IsLeaf(v) ; + } + + size_t LCA(size_t u, size_t v) const + { + return _t.LCA(u, v) ; + } + + // Maps index in B (v) back up to the actual node id + size_t NodeMap(size_t v) const + { + return _t.NodeMap(v) ; + } + + //Map actual node id to index in B (v). + size_t NodeSelect(size_t i) const + { + return _t.NodeSelect(i) ; + } + + // Number of children with label l. 1: has such children. 0-don't + size_t ChildrenLabeled(size_t v, size_t l) const + { + return (_B.Access(NodeMap(v) * _c + l) == 1) ? 1 : 0 ; + } + + // The child with label l. 
+ // Assuming label l's child exist + // Notice the difference from Child() + size_t LabeledChild(size_t v, size_t l) const + { + size_t k = NodeMap(v) ; + size_t r = _B.Rank1(k * _c + l) - _B.Rank1(k * _c) ; + return ChildSelect(v, r + 1) ; + } + + // The label of the edge that leads to node v. + size_t ChildLabel(size_t v) const + { + size_t p = NodeMap(Parent(v)) ; + size_t r = ChildRank(v) ; + return _B.Select( _B.Rank1(p * _c, 0) + r) - p * _c ; + } + + void Save(FILE *fp) + { + Tree_Cardinal::Save(fp) ; + _t.Save(fp) ; + _B.Save(fp) ; + } + + void Load(FILE *fp) + { + Tree_Cardinal::Load(fp) ; + _t.Load(fp) ; + _B.Load(fp) ; + } + +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_Cardinal_Plain.hpp b/compactds/Tree_Cardinal_Plain.hpp new file mode 100644 index 0000000..fcdaafb --- /dev/null +++ b/compactds/Tree_Cardinal_Plain.hpp @@ -0,0 +1,255 @@ +// Cardinal tree, plain representation +// +#ifndef _MOURISL_COMPACTDS_TREE_CARDINAL_PLAIN +#define _MOURISL_COMPACTDS_TREE_CARDINAL_PLAIN + +#include "Tree_Cardinal.hpp" + +namespace compactds { +struct _plainCardinalTreeNode +{ + size_t k ; // It is the k-th(0-based) children of the parent + size_t parent ; + size_t *children ; + + _plainCardinalTreeNode() + { + children = NULL ; + } + + ~_plainCardinalTreeNode() {} ; + + _plainCardinalTreeNode(size_t p, size_t inK, size_t c) + { + parent = p ; + k = inK ; + children = (size_t *)calloc(c, sizeof(size_t)) ; + } + + void Free() + { + if (children) + { + free(children) ; + children = NULL ; + } + } + + void Save(FILE *fp, size_t c) + { + SAVE_VAR(fp, k) ; SAVE_VAR(fp, parent) ; // k must be persisted: ChildRank/NextSibling/PrevSibling read it after Load + SAVE_ARR(fp, children, c) ; + } + + void Load(FILE *fp, size_t c) + { + Free() ; + + LOAD_VAR(fp, k) ; LOAD_VAR(fp, parent) ; + children = (size_t *)calloc(c, sizeof(size_t)) ; + LOAD_ARR(fp, children, c) ; + } +} ; + +class Tree_Cardinal_Plain : public Tree_Cardinal +{ +private: + std::vector _nodes ; + size_t _c ; //child count +public: + Tree_Cardinal_Plain() {} ; + ~Tree_Cardinal_Plain() + { + Free() ; + 
} + + void Init(size_t childCount) + { + _c = childCount ; + struct _plainCardinalTreeNode node(0, 0, _c) ; + _nodes.push_back(node) ; + _n = 1 ; + } + + void Free() + { + size_t i ; + for (i = 0 ; i < _n ; ++i) + _nodes[i].Free() ; + _n = 0 ; + } + + size_t GetSpace(bool inclusive = true) + { + return _nodes.capacity() * sizeof(_nodes[0]) + (inclusive ? sizeof(*this) : 0) ; + } + + size_t AddNode(size_t parent, size_t k) + { + size_t id = _nodes.size() ; + struct _plainCardinalTreeNode node(parent, k, _c) ; + + _nodes[parent].children[k] = id ; + _nodes.push_back(node) ; + ++_n ; + + return id ; + } + + size_t Root() const + { + return 0 ; + } + + // t-th(1-based) child ; + size_t ChildSelect(size_t v, size_t t) const + { + size_t i ; + size_t cnt = 0 ; + for (i = 0 ; i < _c ; ++i) + { + if (_nodes[v].children[i] != 0) + { + if (cnt == t - 1) + return _nodes[v].children[i] ; + ++cnt ; + } + } + return 0 ; + } + + size_t FirstChild(size_t v) const + { + size_t i ; + for (i = 0 ; i < _c ; ++i) + if (_nodes[v].children[i] != 0) + return _nodes[v].children[i] ; + return 0 ; + } + + size_t LastChild(size_t v) const + { + size_t i ; + for (i = _c - 1 ; i < _c ; --i) + if (_nodes[v].children[i] != 0) + return _nodes[v].children[i] ; + return 0 ; + } + + size_t ChildrenCount(size_t v) const + { + size_t i ; + size_t c = FirstChild(v) ; + for (i = 0 ; c != 0 ; ++i) + c = NextSibling(c) ; + return i ; + } + + // return: v is the ret-th (1-based) child of the parent. 
+ size_t ChildRank(size_t v) const + { + if (v == Root()) + return 0 ; + size_t p = Parent(v) ; + size_t ret = 0 ; + size_t i ; + + // inclusive, for rank is always 1-based + for (i = 0 ; i <= _nodes[v].k ; ++i) + if (_nodes[p].children[i] != 0) + ++ret ; + return ret ; + } + + size_t NextSibling(size_t v) const + { + size_t i ; + size_t p = Parent(v) ; + for (i = _nodes[v].k + 1 ; i < _c ; ++i) + if (_nodes[p].children[i] != 0) + return _nodes[p].children[i] ; + return 0 ; + } + + size_t PrevSibling(size_t v) const + { + size_t i ; + size_t p = Parent(v) ; + for (i = _nodes[v].k - 1 ; i < _c ; --i) + if (_nodes[p].children[i] != 0) + return _nodes[p].children[i] ; + return 0 ; + } + + size_t Parent(size_t v) const + { + return _nodes[v].parent ; + } + + bool IsLeaf(size_t v) const + { + return (ChildrenCount(v) == 0) ; + } + + size_t NodeMap(size_t v) const + { + return v ; + } + + size_t NodeSelect(size_t i) const + { + return i ; + } + + // Number of children with label l. 1: has such children. 0-don't + size_t ChildrenLabeled(size_t v, size_t l) const + { + if (_nodes[v].children[l] != 0) + return 1 ; + else + return 0 ; + } + + // The children with label l. + size_t LabeledChild(size_t v, size_t l) const + { + return _nodes[v].children[l] ; + } + + // The label of the edge that leads to node v. 
+ size_t ChildLabel(size_t v) const + { + return _nodes[v].k ; + } + + const std::vector& GetTreeData() const + { + return _nodes ; + } + + void Save(FILE *fp) + { + Tree_Cardinal::Save(fp) ; + size_t i ; + for (i = 0 ; i < _n ; ++i) + _nodes[i].Save(fp, _c) ; + } + + void Load(FILE *fp) + { + std::vector< struct _plainCardinalTreeNode >().swap(_nodes) ; + + Tree_Cardinal::Load(fp) ; + size_t i ; + struct _plainCardinalTreeNode node ; + for (i = 0 ; i < _n ; ++i) + { + node.Load(fp, _c) ; + _nodes.push_back(node) ; + } + } + +} ; +} + +#endif diff --git a/compactds/Tree_DFUDS.hpp b/compactds/Tree_DFUDS.hpp new file mode 100644 index 0000000..11498a6 --- /dev/null +++ b/compactds/Tree_DFUDS.hpp @@ -0,0 +1,283 @@ +#ifndef _MOURISL_COMPACTDS_TREE_DFUDS +#define _MOURISL_COMPACTDS_TREE_DFUDS + +// Depth-First Unary Degree Sequence (Chapter 8.3) +// The implementation details might be different as we are using 0-index +// tree node id (i) as well. +// In the implementation, v is for the index on the encoded bitvector B. + +#include "Tree.hpp" +#include "Tree_Plain.hpp" +#include "Tree_Cardinal_Plain.hpp" +#include "Bitvector_Plain.hpp" + +namespace compactds { +class Tree_DFUDS: public Tree +{ +private: + Bitvector_Plain _B ; + DS_Parenthesis _bp ; // dangling structure, the parentehesis is almost balanced. 
+ + size_t _m ; //|_B|, just for coding simplicity + + void Build(const struct _plainTreeNode *treeNodes, size_t n, size_t tag, size_t *treeIdMap, size_t &visited, size_t &bi) + { + size_t c ; + c = treeNodes[tag].child ; + + treeIdMap[tag] = visited ; + ++visited ; + + for (c = treeNodes[tag].child ; c != 0 ; c = treeNodes[c].sibling) + { + _B.BitSet(bi) ; + ++bi ; + } + ++bi ; // set as 0 + + c = treeNodes[tag].child ; + + for (c = treeNodes[tag].child ; c != 0 ; c = treeNodes[c].sibling) + Build(treeNodes, n, c, treeIdMap, visited, bi) ; + } + + void BuildFromCardinalTree(const struct _plainCardinalTreeNode *treeNodes, size_t n, size_t childCnt, size_t tag, size_t *treeIdMap, size_t &visited, size_t &bi) + { + size_t i ; + treeIdMap[tag] = visited ; + ++visited ; + + for (i = 0 ; i < childCnt; ++i) + { + if (treeNodes[tag].children[i] == 0) + continue ; + _B.BitSet(bi) ; + ++bi ; + } + ++bi ; + + for (i = 0 ; i < childCnt; ++i) + { + if (treeNodes[tag].children[i] == 0) + continue ; + BuildFromCardinalTree(treeNodes, n, childCnt, treeNodes[tag].children[i], treeIdMap, visited, bi) ; + } + } +public: + Tree_DFUDS() {} + ~Tree_DFUDS() + { + Free() ; + } + + void Free() + { + if (_n > 0) + { + _B.Free() ; + _bp.Free() ; + _n = 0 ; + _m = 0 ; + } + } + + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? 
sizeof(*this) : 0) ; + } + + void Init(const struct _plainTreeNode *treeNodes, size_t n, size_t *treeIdMap) + { + _n = n ; + _B.Malloc(2 * _n - 1) ; + size_t bi = 0 ; + size_t visited = 0 ; + Build(treeNodes, n, 0, treeIdMap, visited, bi) ; + _m = bi ; + + _B.Init() ; + _bp.Init(_B.GetData(), _m, 0, 2) ; + + _space = _B.GetSpace() - sizeof(_B) + _bp.GetSpace(false) ; + } + + void InitFromCardinalTree(const struct _plainCardinalTreeNode *treeNodes, size_t n, + size_t childCount, size_t *treeIdMap) + { + _n = n ; + _B.Malloc(2 * _n - 1) ; + size_t bi = 0 ; + size_t visited = 0 ; + BuildFromCardinalTree(treeNodes, n, childCount, 0, treeIdMap, visited, bi) ; + _m = bi ; + + _B.Init() ; + _bp.Init(_B.GetData(), _m, 0, 2) ; + + _space = _B.GetSpace() - sizeof(_B) + _bp.GetSpace(false) ; + } + + // The index in B + size_t Root() const + { + return 0 ; + } + + // @return: the t-th child (1-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + size_t childCnt = ChildrenCount(v) ; + return _bp.Close( v + childCnt - t, _B.GetData(), _m) + 1 ; + } + + size_t FirstChild(size_t v) const + { + return _B.Succ0(v) + 1 ; + } + + size_t LastChild(size_t v) const + { + return _bp.Close(v, _B.GetData(), _m) + 1 ; + } + + size_t ChildrenCount(size_t v) const + { + return _B.Succ0(v) - v ; + } + + // return: v is the ret-th (1-based) child of the parent. + size_t ChildRank(size_t v) const + { + size_t open = _bp.Open(v - 1, _B.GetData(), _m) ; + return _B.Succ0(open) - open ; + } + + // The silbing function assumes v has + // those siblings. 
+ size_t NextSibling(size_t v) const + { + return _bp.GetRmmTree().FwdSearch(v, -1, _B.GetData(), _m) + 1 ; + } + + size_t PrevSibling(size_t v) const + { + // Notice the order here, the closer to the end of the (((), the close + // ) corresponds to the earlier children + return _bp.Close(_bp.Open(v - 1, _B.GetData(), _m) + 1, + _B.GetData(), _m) + 1 ; + } + + size_t Parent(size_t v) const + { + if (v == 0) + return 0 ; + + return _B.Pred0(_bp.Open(v - 1, _B.GetData(), _m)) + 1 ; + } + + // # of nodes in the substree, inclusive. + size_t SubTreeSize(size_t v) const + { + // For 2*m-1 parenthesis in the range: m ')', m-1 '(' + // The equation below is actually (2*m-2)/2 + 1 + return (_bp.GetRmmTree().FwdSearch(v, -1, _B.GetData(), _m) - v) / 2 + 1 ; + } + + // Whether u is an ancestor of v. + bool IsAncestor(size_t u, size_t v) const + { + size_t uend = _bp.GetRmmTree().FwdSearch(u, -1, _B.GetData(), _m) ; + if (v >= u && v <= uend) + return true ; + return false ; + } + + bool IsLeaf(size_t v) const + { + return (_B.Access(v) == 0) ; + } + + size_t LCA(size_t u, size_t v) const + { + if (v < u) + { + size_t tmp = v ; + v = u ; + u = tmp ; + } + + if (IsAncestor(u, v)) + return u ; + //printf("%s: %d %d. %d\n", __func__, u, v, + // _bp.GetRmmTree().Rmq(u, v - 1, _B.GetData(), _m)) ; + + // Think about this more. 
+ // Example (())): node with two leaves + // v-1 in Rmq then add 1 back handles both the leaf case and internal node case + return Parent( _bp.GetRmmTree().Rmq(u, v - 1, + _B.GetData(), _m) + 1) ; + } + + size_t LeafCountInSubTree(size_t v) const + { + if (IsLeaf(v)) + return 1 ; + else + { + size_t vend = _bp.GetRmmTree().FwdSearch(v, -1, _B.GetData(), _m) ; + return _bp.PatternRank(vend - 1, _B.GetData(), _m) - _bp.PatternRank(v, _B.GetData(), _m) ; + } + } + + // Rank and select with respect to leaves in _B order + // Assuming v is the leaf + size_t LeafRank(size_t v, int inclusive = 1) const + { + // The end of 00 corresponds to the leaf + return _bp.PatternRank(v - 1, _B.GetData(), _m, inclusive) ; + } + + size_t LeafSelect(size_t i) const + { + return _bp.PatternSelect(i, _B.GetData(), _m) + 1 ; + } + + // Maps index in B (v) back up to the actual node id + size_t NodeMap(size_t v) const + { + // Inclusive==0 because leaf node will have ')' on the index + return _B.Rank(0, v, /*inclusive=*/0) ; + } + + // Map actual node id to index in B (v). + size_t NodeSelect(size_t i) const + { + if (i == 0) + return 0 ; + return _B.Select(0, i) + 1 ; + } + + void Save(FILE *fp) + { + Tree::Save(fp) ; + + _B.Save(fp) ; + _bp.Save(fp) ; + } + + void Load(FILE *fp) + { + Free() ; + + Tree::Load(fp) ; + + _B.Load(fp) ; + _bp.Load(fp) ; + + _m = 2 * _n - 1 ; + } +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_LOUDS.hpp b/compactds/Tree_LOUDS.hpp new file mode 100644 index 0000000..0450149 --- /dev/null +++ b/compactds/Tree_LOUDS.hpp @@ -0,0 +1,171 @@ +#ifndef _MOURISL_COMPACTDS_TREE_LOUDS +#define _MOURISL_COMPACTDS_TREE_LOUDS + +// Level-order Unary degree sequence (Chapter 8.1) +// The implementation details might be different as we are using 0-index +// tree node id (i) as well. +// In the implementation, v is for the index on the encoded bitvector B. 
+ +#include "Tree.hpp" +#include "Tree_Plain.hpp" +#include "Bitvector_Plain.hpp" + +namespace compactds { +class Tree_LOUDS: public Tree +{ +private: + Bitvector_Plain _B ; +public: + size_t GetSpace(bool inclusive = true) + { + return _space + (inclusive ? sizeof(*this) : 0) ; + } + + void Init(const struct _plainTreeNode *treeNodes, size_t n, size_t *treeIdMap) + { + size_t i ; + _B.Malloc(2 * n - 1) ; // n nodes, n-1 edges + _space = _B.GetSpace() - sizeof(_B); + + // BFS on tree nodes + // The algorithm 8.3 will change the original plain tree + size_t *queue = (size_t *)malloc(sizeof(size_t) * n) ; + size_t qhead, qtail ; + size_t m = 0 ; // position on _B + queue[0] = 0 ; + qhead = 0 ; + qtail = 1 ; + while (qhead < qtail) + { + size_t node = queue[qhead] ; + if (treeIdMap != NULL) + treeIdMap[node] = qhead ; + + ++qhead ; + + size_t childCnt = 0 ; + size_t c = treeNodes[node].child ; + for (childCnt = 0 ; c != 0 ; ++childCnt) + { + queue[qtail] = c ; + ++qtail ; + c = treeNodes[c].sibling ; + } + + for (i = m ; i < m + childCnt ; ++i) + _B.BitSet(i) ; + m += childCnt + 1 ; + } + free(queue) ; + + _B.Init() ; + } + + // The index in B + size_t Root() const + { + return 0 ; + } + + // @return: the t-th child (1-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + return _B.Select(0, _B.Rank(1, v + t - 1)) + 1 ; + } + + size_t FirstChild(size_t v) const + { + return ChildSelect(v, 1) ; + } + + size_t LastChild(size_t v) const + { + return ChildSelect(v, ChildrenCount(v)) ; + } + + size_t ChildrenCount(size_t v) const + { + return _B.Succ0(v) - v ; + } + + // return: v is the ret-th (1-based) child of the parent. 
+ size_t ChildRank(size_t v) const + { + if (v == Root()) + return 0 ; + //_B.Rank(0, v-1): the node id + //_B.Select(1, _B.Rank(0, v-1)): The edge connect the parent and v + size_t j = _B.Select(1, _B.Rank(0, v - 1)) ; + return j - _B.Pred0(j) ; // rank is always 1-based + } + + // The silbing function assumes v has + // those siblings. + size_t NextSibling(size_t v) const + { + return _B.Succ0(v) + 1 ; + } + + size_t PrevSibling(size_t v) const + { + return _B.Pred0(v - 2) + 1 ; + } + + size_t Parent(size_t v) const + { + if (v == Root()) + return 0 ; + size_t j = _B.Select(1, _B.Rank(0, v - 1)) ; + return _B.Pred0(j) + 1 ; + } + + bool IsLeaf(size_t v) const + { + return _B.Access(v) == 0 ; + } + + size_t LCA(size_t u, size_t v) const + { + while (u != v) + { + if (u > v) + u = Parent(u) ; + else + v = Parent(v) ; + } + return u ; + } + + // Maps index in B (v) back up to the actual node id + size_t NodeMap(size_t v) const + { + // Exclude current 0 in case v is the leaf. + return _B.Rank(0, v, /*inclusive=*/0) ; + } + + //Map actual node id to index in B (v). + size_t NodeSelect(size_t i) const + { + if (i == 0) + return Root() ; + else + return _B.Select(0, i) + 1 ; + } + + void Save(FILE *fp) + { + Tree::Save(fp) ; + _B.Save(fp) ; + } + + void Load(FILE *fp) + { + Tree::Load(fp) ; + _B.Load(fp) ; + } + +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_Labeled.hpp b/compactds/Tree_Labeled.hpp new file mode 100644 index 0000000..3f63a95 --- /dev/null +++ b/compactds/Tree_Labeled.hpp @@ -0,0 +1,256 @@ +#ifndef _MOURISL_COMPACTDS_TREE_LABELED +#define _MOURISL_COMPACTDS_TREE_LABELED + +// Labeled tree. The difference from the text book that we +// assume general representation of the tree structure, including BP. 
+// Therefore, for the concatenated children labels, we have an +// additional bit vector to indicate the start of each children label series, +// and we also have a place holder for the start position in the labels +// (this could be redundant, but having a explicit bit marker is more efficient) + +#include "Bitvector_Plain.hpp" +#include "Tree.hpp" + +#include "Sequence_WaveletTree.hpp" + +#include "Tree_LOUDS.hpp" +#include "Tree_BP.hpp" +#include "Tree_DFUDS.hpp" + +#include +#include + +namespace compactds { + +template , + class SequenceMarkerClass = Bitvector_Plain> +class Tree_Labeled: public Tree +{ +private: + TreeClass _t ; + SequenceClass _l ; + SequenceMarkerClass _lmarker ; // markers on l, indicating the start of the labels from a node + std::map _lmap ; // mapping label from size_to to ALPHABET + std::vector _lmapback ; // map labels from ALPAHBET back to original value +public: + size_t GetSpace(bool inclusive = true) + { + return _space + _lmap.size() * (sizeof(size_t) + sizeof(ALPHABET)) + _lmapback.capacity() + (inclusive ? sizeof(*this) : 0) ; + } + + // Use c bits per node. 
+ void Init(const struct _plainTreeNode *treeNodes, size_t n, + size_t *treeIdMap) + { + size_t i ; + _n = n ; + _t.Init(treeNodes, n, treeIdMap) ; + + for (i = 1 ; i < n ; ++i) + { + if (_lmap.find(treeNodes[i].label) == _lmap.end()) + { + ALPHABET id = _lmap.size() ; + _lmap[ treeNodes[i].label ] = id ; + _lmapback.push_back( treeNodes[i].label ) ; + } + } + + std::vector alphabetList ; + size_t lmapSize = _lmap.size() ; + for (i = 0 ; i < lmapSize ; ++i) + alphabetList.push_back((ALPHABET)i) ; + + ALPHABET placeHolder = lmapSize ; + alphabetList.push_back(placeHolder) ; + ++lmapSize ; + + FixedSizeElemArray childrenLabels ; + Alphabet labelAlphabet ; + int lmapBits = labelAlphabet.InitFromList(alphabetList.data(), lmapSize) ; + childrenLabels.Malloc(lmapBits, 2 * n - 1) ; + + WORD *W = Utils::MallocByBits(2 * n - 1) ; + size_t lused = 0 ; + size_t *nodeOrder = (size_t *)malloc(sizeof(*nodeOrder) * n) ; + for (i = 0 ; i < n ; ++i) + nodeOrder[ treeIdMap[i] ] = i ; + for (i = 0 ; i < n ; ++i) + { + size_t k = nodeOrder[i] ; + size_t c = treeNodes[k].child ; + + Utils::BitSet(W, lused) ; //mark the start of the child series + childrenLabels.Write(lused, placeHolder) ; + ++lused ; + while (c != 0) + { + childrenLabels.Write(lused, _lmap[treeNodes[c].label]) ; + ++lused ; + + c = treeNodes[c].sibling ; + } + } + free(nodeOrder) ; + + _l.SetAlphabet(labelAlphabet) ; + _l.Init(childrenLabels, 2 * n - 1, alphabetList.data()) ; + _lmarker.Init(W, 2 * n - 1) ; + + free(W) ; + + _space = _t.GetSpace(false) + _l.GetSpace() - sizeof(_l) + _lmarker.GetSpace() - sizeof(_lmarker) ; + } + + // The index in B + size_t Root() const + { + return _t.Root() ; + } + + // @return: the t-th child (1-based) of node v in B vector + size_t ChildSelect(size_t v, size_t t) const + { + return _t.ChildSelect(v, t) ; + } + + size_t FirstChild(size_t v) const + { + return _t.FirstChild(v) ; + } + + size_t LastChild(size_t v) const + { + return _t.LastChild(v) ; + } + + size_t 
ChildrenCount(size_t v) const + { + return _t.ChildrenCount(v) ; + } + + // return: v is the ret-th (1-based) child of the parent. + size_t ChildRank(size_t v) const + { + return _t.ChildRank(v) ; + } + + // The silbing function assumes v has + // those siblings. + size_t NextSibling(size_t v) const + { + return _t.NextSibling(v) ; + } + + size_t PrevSibling(size_t v) const + { + return _t.PrevSibling(v) ; + } + + size_t Parent(size_t v) const + { + return _t.Parent(v) ; + } + + bool IsLeaf(size_t v) const + { + return _t.IsLeaf(v) ; + } + + size_t LCA(size_t u, size_t v) const + { + return _t.LCA(u, v) ; + } + + // Maps index in B (v) back up to the actual node id + size_t NodeMap(size_t v) const + { + return _t.NodeMap(v) ; + } + + //Map actual node id to index in B (v). + size_t NodeSelect(size_t i) const + { + return _t.NodeSelect(i) ; + } + + // Number of children with label l. 1: has such children. 0-don't + size_t ChildrenLabeled(size_t v, size_t l) const + { + size_t i = NodeMap(v) ; + ALPHABET lmapped = _lmap.at(l) ; + + size_t start = _lmarker.Select(i + 1) ; + if (i == _n - 1) + { + return _l.Rank(lmapped, 2 * _n - 2) - _l.Rank(lmapped, start); + } + else + { + return _l.Rank(lmapped, _lmarker.Select(i + 2) - 1) - _l.Rank(lmapped, start) ; + } + } + + // The t-th child with label l. + size_t LabeledChildSelect(size_t v, size_t l, size_t t) const + { + size_t i = NodeMap(v) ; + ALPHABET lmapped = _lmap.at(l) ; + size_t start = _lmarker.Select(i + 1) ; + + size_t childRank = _l.Select(lmapped, _l.Rank(lmapped, start) + t) - start ; + return ChildSelect(v, childRank) ; + } + + // The label of the edge that leads to node v. 
+ size_t ChildLabel(size_t v) const + { + if (v == Root()) + return 0 ; + + size_t childRank = ChildRank(v) ; + size_t p = Parent(v) ; + return _lmapback.at( _l.Access( _lmarker.Select(NodeMap(p) + 1) + childRank) ) ; + } + + void Save(FILE *fp) + { + Tree::Save(fp) ; + _t.Save(fp) ; + _l.Save(fp) ; + _lmarker.Save(fp) ; + + size_t size = _lmapback.size() ; + SAVE_VAR(fp, size) ; + size_t i ; + for (i = 0 ; i < size ; ++i) + { + SAVE_VAR(fp, _lmapback[i]) ; + } + } + + void Load(FILE *fp) + { + Tree::Load(fp) ; + _t.Load(fp) ; + _l.Load(fp) ; + _lmarker.Load(fp) ; + + _lmap.clear() ; + _lmapback.clear() ; + size_t lmapSize ; + LOAD_VAR(fp, lmapSize) ; + size_t i ; + for (i = 0 ; i < lmapSize ; ++i) + { + size_t l ; + LOAD_VAR(fp, l) ; + _lmapback.push_back(l) ; + _lmap[l] = i ; + } + } +} ; + +} // end of name space + +#endif diff --git a/compactds/Tree_Plain.hpp b/compactds/Tree_Plain.hpp new file mode 100644 index 0000000..a2be8a1 --- /dev/null +++ b/compactds/Tree_Plain.hpp @@ -0,0 +1,277 @@ +#ifndef _MOURISL_COMPACTDS_TREE_PLAIN +#define _MOURISL_COMPACTDS_TREE_PLAIN + +#include + +#include "Tree.hpp" +#include "SimpleVector.hpp" + +namespace compactds { +struct _plainTreeNode +{ + size_t parent ; + size_t sibling ; + size_t child ; + size_t lastChild ; + + size_t label ; // the label from the parent to itself + + _plainTreeNode(size_t p, size_t s, size_t c, size_t lc) + { + parent = p ; + sibling = s ; + child = c ; + lastChild = lc ; + label = 0 ; + } + + void Save(FILE *fp) + { + SAVE_VAR(fp, parent) ; + SAVE_VAR(fp, sibling) ; + SAVE_VAR(fp, child) ; + SAVE_VAR(fp, lastChild) ; SAVE_VAR(fp, label) ; // label is read by ChildrenLabeled/ChildLabel, so persist it + } + + void Load(FILE *fp) + { + LOAD_VAR(fp, parent) ; + LOAD_VAR(fp, sibling) ; + LOAD_VAR(fp, child) ; + LOAD_VAR(fp, lastChild) ; LOAD_VAR(fp, label) ; + } +} ; + +class Tree_Plain: public Tree +{ +private: + std::vector _nodes ; +public: + Tree_Plain() {} + ~Tree_Plain() {} + + void Init() + { + _n = 1 ; + + struct _plainTreeNode node(0, 0, 0, 0) ; + _nodes.push_back(node) ; + } + + size_t 
GetSpace(bool inclusive = true) + { + return _nodes.capacity() * sizeof(struct _plainTreeNode) + (inclusive ? sizeof(*this) : 0) ; + } + + // Assumes parent is already in the tree + //@return: tree index + size_t AddNode(size_t parent) + { + size_t id = _nodes.size() ; + struct _plainTreeNode node(parent, 0, 0, 0) ; + size_t lastSibling = LastChild(parent) ; + + if (lastSibling == 0) + _nodes[parent].child = id ; + else + _nodes[lastSibling].sibling = id ; + + _nodes[parent].lastChild = id ; + _nodes.push_back(node) ; + ++_n ; + + return id ; + } + + size_t Root() const + { + return 0 ; + } + + // t-th(1-based) child ; + size_t ChildSelect(size_t v, size_t t) const + { + --t ; + + size_t c = FirstChild(v) ; + size_t i ; + for (i = 0 ; i < t ; ++i) + c = NextSibling(c) ; + return c ; + } + + size_t FirstChild(size_t v) const + { + return _nodes[v].child ; + } + + size_t LastChild(size_t v) const + { + return _nodes[v].lastChild ; + } + + size_t ChildrenCount(size_t v) const + { + size_t i ; + size_t c = FirstChild(v) ; + for (i = 0 ; c != 0 ; ++i) + c = NextSibling(c) ; + return i ; + } + + // return: v is the ret-th (1-based) child of the parent. 
+ size_t ChildRank(size_t v) const + { + if (v == Root()) + return 0 ; + size_t c = FirstChild( Parent(v) ) ; + size_t i ; + for (i = 0 ; c != v ; ++i) + c = NextSibling(c) ; + return i + 1 ; // +1: rank is always 1-based + } + + size_t NextSibling(size_t v) const + { + return _nodes[v].sibling ; + } + + size_t PrevSibling(size_t v) const + { + size_t i ; + size_t c = FirstChild( Parent(v) ) ; + for (i = 0 ; v != NextSibling(c) ; ++i) + c = NextSibling(c) ; + return c ; + } + + size_t Parent(size_t v) const + { + return _nodes[v].parent ; + } + + bool IsLeaf(size_t v) const + { + if (_nodes[v].child == 0) + return true ; + return false ; + } + + size_t LCA(size_t u, size_t v) const + { + SimpleVector upath ; + SimpleVector vpath ; + + size_t p ; + + upath.PushBack(u) ; + p = Parent(u) ; + while (p != 0) + { + upath.PushBack(p) ; + p = Parent(p) ; + } + upath.PushBack(0) ; + + vpath.PushBack(v) ; + p = Parent(v) ; + while (p != 0) + { + vpath.PushBack(p) ; + p = Parent(p) ; + } + vpath.PushBack(0) ; + + upath.Reverse() ; + vpath.Reverse() ; + + size_t size = MIN(upath.Size(), vpath.Size()) ; + size_t i ; + for (i = 0 ; i < size; ++i) + if (upath[i] != vpath[i]) + break ; + return upath[i - 1] ; + } + + size_t NodeMap(size_t v) const + { + return v ; + } + + size_t NodeSelect(size_t i) const + { + return i ; + } + + const std::vector& GetTreeData() const + { + return _nodes ; + } + + void SetLabel(size_t v, size_t l) + { + _nodes[v].label = l ; + } + + // Number of children with label l. 1: has such children. 0-don't + size_t ChildrenLabeled(size_t v, size_t l) const + { + size_t ret = 0 ; + int c = _nodes[v].child ; + while (c != 0) + { + if (_nodes[c].label == l) + ++ret ; + c = _nodes[c].sibling ; + } + return ret ; + } + + // The child with label l. 
+ size_t LabeledChildSelect(size_t v, size_t l, size_t t) const + { + size_t cnt = 0 ; + size_t c = _nodes[v].child ; + while (c != 0) + { + if (_nodes[c].label == l) + ++cnt ; + if (cnt >= t) + break ; + c = _nodes[c].sibling ; + } + return c ; + } + + // The label of the edge that leads to node v. + size_t ChildLabel(size_t v) const + { + return _nodes[v].label ; + } + + void Save(FILE *fp) + { + Tree::Save(fp) ; + size_t i ; + for (i = 0 ; i < _n ; ++i) + _nodes[i].Save(fp) ; + } + + void Load(FILE *fp) + { + std::vector< struct _plainTreeNode >().swap(_nodes) ; + + Tree::Load(fp) ; + size_t i ; + struct _plainTreeNode node(0, 0, 0, 0) ; + for (i = 0 ; i < _n ; ++i) + { + node.Load(fp) ; + _nodes.push_back(node) ; + } + } +} ; + +} + +#endif diff --git a/compactds/UniversalHashGenerator.hpp b/compactds/UniversalHashGenerator.hpp new file mode 100644 index 0000000..636be0b --- /dev/null +++ b/compactds/UniversalHashGenerator.hpp @@ -0,0 +1,81 @@ +#ifndef _MOURISL_COMPACTDS_UNIVERSALHASHGENERATOR +#define _MOURISL_COMPACTDS_UNIVERSALHASHGENERATOR + +#include "Utils.hpp" + +// Universal hash family of ((a*x+b)%p)%m + +namespace compactds { +class UniversalHashGenerator +{ +private: + const uint64_t p ; // The largest prime in 63bit, so 2*p can be in 64bit + uint32_t state ; + uint64_t m ; + + // Lehmer random generator + // https://en.wikipedia.org/wiki/Lehmer_random_number_generator + uint32_t Random() + { + return state = (state * 279470273ull) % 0xfffffffb; + } + + // Not really 64 bit due to 0xfffffffb, but close enough + uint64_t Random64() + { + uint32_t lower32 = Random() ; + uint32_t upper32 = Random() ; + return (upper32 * 0xfffffffbull) + lower32 ; + } +public: + UniversalHashGenerator():p(9223372036854775783ull) {} + ~UniversalHashGenerator() {} + + size_t GetSpace() {return sizeof(*this);} + + // map [0..n] to the range of [0,..,m-1] + // @return: the big prime p. 
0 if failed + uint64_t Init(uint64_t m, uint32_t seed) + { + if (seed == 0) + seed = 17 ; + state = seed ; + this->m = m ; + + return p ; + } + + size_t GetP() + { + return p; + } + + void SetSeed(uint32_t seed) + { + state = seed ; + } + + // Generate a pair of (a, b) + void Generate(uint64_t &a, uint64_t &b) + { + a = Random64() ; + if (a == 0) + { + state = 17 ; + a = Random64() ; + } + b = Random64() ; + } + + // Though the outside program should have enough information + // to do the mapping on its own, we provide the function here + // for convenience. + // The function should handle the overflow of a*x + uint64_t Map(uint64_t a, uint64_t b, uint64_t x) + { + return (Utils::SafeMultiMod(x, a, p) + b)%p % m ; + } +} ; +} + +#endif diff --git a/compactds/Utils.hpp b/compactds/Utils.hpp new file mode 100644 index 0000000..e4091c9 --- /dev/null +++ b/compactds/Utils.hpp @@ -0,0 +1,292 @@ +#ifndef _MOURISL_COMPACTDS_UTILS +#define _MOURISL_COMPACTDS_UTILS + +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __SSE4_2__ +#include +#endif + +namespace compactds { +#define WORD_64 // comment this out if word size is 32 + +#ifdef WORD_64 + typedef uint64_t WORD ; + #define WORDBITS 64 + #define WORDBYTES 8 + #define WORDBITS_WIDTH 6 +#else + typedef uint32_t WORD ; + #define WORDBITS 32 + #define WORDBYTES 4 + #define WORDBITS_WIDTH 6 + #define WORDBITS_WIDTH 5 +#endif + +#define DIV_CEIL(x,y) (((x)%(y))?((x)/(y)+1):((x)/(y))) +#define CEIL(x) (((int)(x) == (x))?((int)(x)):((int)(x) + 1)) +#define MIN(x,y) ((x)<=(y)?(x):(y)) +#define MAX(x,y) ((x)<=(y)?(y):(x)) + +// Create a mask of l 1s +#define MASK_WCHECK(l) (((l)>=(int)WORDBITS)?(0xffffffffffffffff):((1ull<<(l))-1ull)) +#define MASK(l) ((1ull<<((uint64_t)(l)))-1ull) + +// positive infinity +#define POSITIVE_INF ((uint64_t)-1) + +// x-y modules by k-bit block , which are wide words has k bits subblocks +// h masks/controls the block size +// Sebastiano Vigna, Broadword 
implementation of rank/select queries, 2008 +#define BITBLOCK_MODDIFF(x,y,h) (((x)|(h)) - ((y)&(~(h)))^(((x)^(~(y))&(h)))) +// Test x0 in a subblock fashion +#define BITBLOCK_GZERO(x, l, h) (((((x)|(h))-(l)) | (x)) & (h)) + +#define SAVE_VAR(fp, x) (fwrite(&(x), sizeof(x), 1, (fp))) +#define LOAD_VAR(fp, x) (fread(&(x), sizeof(x), 1, (fp))) + +#define SAVE_ARR(fp, x, n) (fwrite((x), sizeof(*(x)), (n), (fp))) +#define LOAD_ARR(fp, x, n) (fread((x), sizeof(*(x)), (n), (fp))) + +class Utils +{ +public: + // How many bits in the input x + static int CountBits(WORD x) + { + int ret = 0 ; + for (; x ; x >>= 1) + ++ret ; + return ret ; + } + + // Count the number of 1's in x. + static int Popcount(WORD x) + { +#ifdef __SSE4_2__ + return __builtin_popcountll(x); +#else +#ifdef WORD_64 + x = x - ((x >> 1) & 0x5555555555555555ull) ; + x = (x&0x3333333333333333ull) + ((x>>2)&0x3333333333333333ull) ; + return (((x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full) * 0x0101010101010101ull) >> 56 ; +#else + x = x - ((x >> 1) & 0x55555555) ; + x = (x&0x33333333) + ((x>>2)&0x33333333) ; + return (((x + (x >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24 ; +#endif +#endif + /*else + { + int ret = 0 ; + for (; x ; x &= (x-1)) + ++ret ; + return ret ; + }*/ + } + + // Select the r-th (1-index) 1 in word x + static int SelectInWord(WORD x, int r) + { + const uint64_t l8 = 0x0101010101010101ull ; + const uint64_t h8 = l8 << 7ull ; + --r ; + + uint64_t s, b, l ; + // Calculate the byte-wise partial sums + s = x - ((x & 0xAAAAAAAAAAAAAAAAull) >> 1) ; + s = (s & 0x3333333333333333ull) + ((s>>2) & 0x3333333333333333ull) ; + s = ((s + (s>>4))&0x0f0f0f0f0f0f0f0full) * l8 ; + // Locate the byte + // >> 53 is kind of make the byte unit to bit unit (>>56 << 3), makes later shift easier. 
+ b = (((BITBLOCK_LEQ(s, r * l8, h8)>>7) * l8) >> 53) & (~7ull) ; + l = r - (((s<<8) >> b) & 0xff) ; // update remainder + // Seems the 0x804..01ull trick is to expand the bit information into each byte + // each bit in a byte will be in its own byte of a 64bit integer + s = (BITBLOCK_GZERO(((x >> b & 0xff) * l8 & 0x8040201008040201ull), l8, h8) >> 7) * l8 ; + return b + (((BITBLOCK_LEQ(s, l * l8, h8) >> 7) * l8) >> 56) ; + + } + + // Compute ceil(log2(x)) without float computation + static int Log2Ceil(WORD x) + { + int bcnt = CountBits(x) ; + if (x == (1ull<<(bcnt - 1))) + return bcnt - 1 ; + else + return bcnt ; + } + + // The power function in the integer space + static uint64_t PowerInt(int x, int y) + { + uint64_t ret = 1 ; + uint64_t powerx = x ; + while (y) + { + if (y & 1) + ret *= powerx ; + powerx *= powerx ; + y >>= 1 ; + } + return ret ; + } + + // The multiplication then take mode: (a*b)%m + // that make sure a*b not overflow + static uint64_t SafeMultiMod(uint64_t a, uint64_t b, uint64_t m) + { + uint64_t ret = 0 ; + a %= m ; + while (b) + { + if (b & 1) + ret += a%m ; + a = (a * 2)%m ; + b >>= 1 ; + } + return ret ; + } + + // Assuming only span two words at most. + // s is j', e is j + // Get B[s..e] + static WORD BitsRead(const WORD *W, const size_t s, const size_t e) + { + // In practice we should let other part be correct about this + //if (s > e) + // return 0 ; + + const size_t ie = e >> WORDBITS_WIDTH ; // index for e + const size_t is = s >> WORDBITS_WIDTH ; + + const int rs = s & (WORDBITS - 1) ; + + if (ie == is) + { + // in the same block + return (W[ie] >> rs) & MASK_WCHECK(e-s+1) ; + } + else + { + const int re = e & (WORDBITS - 1) ;// e%w, the residual offset within a word + // Since ie!=is, re must be less than 63, so we don't need to check the MASK. + return (W[is] >> rs) | ((W[ie] & MASK(re + 1)) << (WORDBITS - rs)) ; + } + } + + // Write B[s..e]=x. 
+ static void BitsWrite(WORD *W, size_t s, size_t e, WORD x) + { + if (s > e) + return ; + const int w = sizeof(WORD) * 8 ; + int re = e & (w - 1) ;// e%w, the residual offset within a word + int rs = s & (w - 1) ; + + size_t ie = e/w ; // index for e + size_t is = s/w ; + + if (ie == is) + { + W[ie] = (W[ie] & ~(MASK_WCHECK(e-s+1) << rs)) | ((WORD)x<> (w-rs)) ; + } + } + + static int BitRead(const WORD *W, size_t i) + { + return (W[i>>WORDBITS_WIDTH] >> (i&(WORDBITS-1)))&1ull ; + } + + static void BitSet(WORD *W, size_t i) + { + W[i>>WORDBITS_WIDTH] |= (1ull << (i&(WORDBITS-1))) ; + } + + static void BitFlip(WORD *W, size_t i) + { + W[i>>WORDBITS_WIDTH] ^= (1ull << (i&(WORDBITS-1))) ; + } + + static void BitClear(WORD *W, size_t i) + { + if (BitRead(W, i)) + W[i>>WORDBITS_WIDTH] -= (1ull << (i&(WORDBITS-1))) ; + } + + static size_t BitsToWordBytes(size_t l) + { + return sizeof(WORD) * DIV_CEIL(l, sizeof(WORD)*8) ; + } + + static size_t BitsToWords(size_t l) + { + return DIV_CEIL(l, sizeof(WORD)*8) ; + } + + static WORD *MallocByBits(size_t l) + { + return (WORD *)calloc(BitsToWords(l), sizeof(WORD)) ; + } + + // Translate the space usage description (TB, GB, MB, KB) to bytes + static size_t SpaceStringToBytes(const char *s) + { + int i ; + size_t ret = 0 ; + for (i = 0 ; s[i] >= '0' && s[i] <= '9' ; ++i) + ret = ret * 10 + s[i] - '0' ; + + switch (s[i]) + { + case 'T': + case 't': + ret *= 1000000000000 ; break ; + case 'G': + case 'g': + ret *= 1000000000 ; break ; + case 'M': + case 'm': + ret *= 1000000 ; break ; + case 'K': + case 'k': + ret *= 1000 ; break ; + } + + return ret ; + } + + static void PrintLog( const char *fmt, ... 
) + { + va_list args ; + va_start( args, fmt ) ; + char buffer[500] ; + vsprintf( buffer, fmt, args ) ; + + time_t mytime = time(NULL) ; + struct tm *localT = localtime( &mytime ) ; + char stime[500] ; + strftime( stime, sizeof( stime ), "%c", localT ) ; + fprintf( stderr, "[%s] %s\n", stime, buffer ) ; + } +} ; +} +#endif diff --git a/compactds/VariableSizeElemArray.hpp b/compactds/VariableSizeElemArray.hpp new file mode 100644 index 0000000..b67d0f9 --- /dev/null +++ b/compactds/VariableSizeElemArray.hpp @@ -0,0 +1,33 @@ +#ifndef _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY +#define _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY + +#include + +#include "Utils.hpp" +#include "FixedSizeElemArray.hpp" + +/* + * The class for the array where each element has variable size + */ +namespace compactds { +class VariableSizeElemArray +{ +public: + VariableSizeElemArray() {} + + ~VariableSizeElemArray() {} + + virtual void Free() = 0; + + // Create the variable size element array + // b - block size + // in - input array + // n - the length of input array + // + virtual void InitFromArray(int b, const unsigned int *in, const size_t &n) = 0 ; + + virtual unsigned int Read(size_t i) = 0 ; +} ; +} + +#endif diff --git a/compactds/VariableSizeElemArray_DensePointers.hpp b/compactds/VariableSizeElemArray_DensePointers.hpp new file mode 100644 index 0000000..0984534 --- /dev/null +++ b/compactds/VariableSizeElemArray_DensePointers.hpp @@ -0,0 +1,144 @@ +#ifndef _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_DENSEPOINTERS +#define _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_DENSEPOINTERS + +#include + +#include "Utils.hpp" +#include "FixedSizeElemArray.hpp" + +#include "VariableSizeElemArray.hpp" + +/* + * The class for the array where each element has variable size + * Implement with dense pointers for constant time access (Section 3.2.2) + */ +namespace compactds { +class VariableSizeElemArray_DensePointers: public VariableSizeElemArray +{ +private: + WORD *M ; // the compressed data + size_t 
*P ; // sampled pointer + FixedSizeElemArray offsets ; // the offset within each block + int b ; + size_t n ; + int lastPosInM ; // the last position used in M + int space ; +public: + VariableSizeElemArray_DensePointers() + { + M = NULL ; + P = NULL ; + space = 0 ; + } + + ~VariableSizeElemArray_DensePointers() + { + Free() ; + } + + void Free() + { + if (M != NULL) + free(M) ; + if (P != NULL) + free(P) ; + offsets.Free() ; + M = NULL ; + P = NULL ; + } + + // Create the variable size element array + // b - block size. has to be > 1 + // in - input array (non-negative) + // n - the length of input array + void InitFromArray(int blockSize, const unsigned int *in, const size_t &n) + { + size_t i, j ; + int maxL = 0 ; + size_t totalL = 0 ; + this->n = n ; + for (i = 0 ; i < n ; ++i) + { + int bcnt = Utils::CountBits(in[i] + 1) ; // need to shift 1 to allow 0. + if (bcnt > maxL) + maxL = bcnt ; + totalL += bcnt ; + } + + b = blockSize ; + + if (b <= 1) + { + b = CEIL(sizeof(WORD) * 8 * log(2)) ; // TODO: automatic block size determination + } + + size_t blockCnt = DIV_CEIL(n, b) ; + P = (size_t *)malloc(blockCnt * sizeof(size_t)) ; + offsets.Malloc( Utils::Log2Ceil( (b - 1) * (double)(maxL-1) ), n - blockCnt) ; + space = blockCnt * sizeof(size_t) + offsets.GetSpace() - sizeof(offsets); + + M = Utils::MallocByBits(totalL - n) ; // We don't store the highest bit so -n + space += Utils::BitsToWordBytes(totalL - n) ; + + // Encode the data + size_t sumL = 0 ; + size_t withinOffset = 0 ; + // i for indexing the input array, j for indexing the offsets array + for (i = 0, j = 0 ; i < n ; ++i) + { + int bcnt = Utils::CountBits(in[i] + 1) ; + if (i % b == 0) + { + P[i/b] = sumL ; + withinOffset = 0 ; + } + else // Only store the offet of the first element + { + offsets.Write(j, withinOffset) ; + ++j ; + } + if (in[i] == 0) + continue ; + + Utils::BitsWrite(M, sumL, sumL + bcnt - 1 - 1, (in[i] + 1) & MASK(bcnt - 1)) ; + //if (j > 0) printf("i=%d j=%d: in[i]=%d bcnt-1=%d. 
sumL=%d withinOffset=%d. offsets[j]=%d |elem|=%d\n", i, j, in[i], bcnt - 1, sumL, withinOffset, offsets.Read(j - 1), offsets.GetElemLength()); + sumL += bcnt - 1 ; + withinOffset += bcnt - 1 ; + } + lastPosInM = sumL - 1 ; + } + + unsigned int Read(size_t i) + { + int pi = i / b ; // index in P + int presidual = i % b ; + int nextpi = (i + 1) / b ; + size_t ms, me ; // start and end in M. + + ms = P[pi] ; + if (presidual > 0) + ms = P[pi] + offsets.Read(i - pi - 1) ; // -pi because each block skip one elemtn in offsets + if (i + 1 < n) + { + if (pi == nextpi) + me = P[pi] + offsets.Read(i + 1 - pi - 1) - 1; + else + me = P[nextpi] - 1 ; + } + else + me = lastPosInM ; + + //printf("\ni=%d: ms=%d me=%d. pi=%d offset=%d\n", i, ms, me, pi, offsets.Read(i - pi)) ; + if (ms > me || (i == 0 && me == (size_t)-1)) + return 0 ; + return (Utils::BitsRead(M, ms, me) | (1<<(me - ms + 1))) - 1; + } + + int GetSpace() + { + return space + sizeof(*this) ; + } +} ; +} + +#endif diff --git a/compactds/VariableSizeElemArray_DirectAccess.hpp b/compactds/VariableSizeElemArray_DirectAccess.hpp new file mode 100644 index 0000000..b2dcca6 --- /dev/null +++ b/compactds/VariableSizeElemArray_DirectAccess.hpp @@ -0,0 +1,76 @@ +#ifndef _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_DIRECTACCESS +#define _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_DIRECTACCESS + +#include + +#include "Utils.hpp" +#include "Encode.hpp" +#include "FixedSizeElemArray.hpp" +#include "VariableSizeElemArray.hpp" + + +/* + * The class for the array where each element has variable size + */ +namespace compactds { +class VariableSizeElemArray_DirectAccess : public VariableSizeElemArray +{ +private: + WORD **M ; // mark whether this is the last piece + WORD **P ; // the piece of block size b + + int b ; // block size + int levelCnt ; // the number of dimensions for M and P +public: + VariableSizeElemArray() + { + } + + ~VariableSizeElemArray() + { + Free() ; + } + + void Free() + { + } + + // Create the variable size element 
array + // b - block size + // in - input array + // n - the length of input array + // + void InitFromArray(int blockSize, const unsigned int *in, const size_t &n) + { + int totalL = 0 ; // total bit length + int maxL = 0 ; + int i ; + for (i = 0 ; i < n ; ++i) + { + int bcnt = Utils::CountBits(in[i]) ; + totalL += bcnt ; + if (bcnt > maxL) + maxL = bcnt ; + } + + if (b <= 0) + b = log(n) / log(2) ; //TODO: check + levelCount = DIV_CEIL(maxL, b) ; + + M = (WORD *)malloc(sizeof(WORD *) * levelCount) ; + P = (WORD *)malloc(sizeof(WORD *) * levelCount) ; + + for (i = 0 ; i < n ; ++i) + { + + } + } + + unsigned int Read(int i) + { + return 0 ; + } +} ; +} + +#endif diff --git a/compactds/VariableSizeElemArray_SampledPointers.hpp b/compactds/VariableSizeElemArray_SampledPointers.hpp new file mode 100644 index 0000000..5218abd --- /dev/null +++ b/compactds/VariableSizeElemArray_SampledPointers.hpp @@ -0,0 +1,118 @@ +#ifndef _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_SAMPLEDPOINTERS +#define _MOURISL_COMPACTDS_VARIABLESIZEELEM_ARRAY_SAMPLEDPOINTERS + +#include + +#include "Utils.hpp" +#include "EliasCode.hpp" +#include "FixedSizeElemArray.hpp" + +#include "VariableSizeElemArray.hpp" + +/* + * The class for the array where each element has variable size + * Implement with sampled pointers (Section 3.2.1) + */ +namespace compactds { +class VariableSizeElemArray_SampledPointers: public VariableSizeElemArray +{ +private: + WORD *M ; // the compressed data + size_t *P ; // sampled pointer + int b ; + + int space ; +public: + VariableSizeElemArray_SampledPointers() + { + M = NULL ; + P = NULL ; + space = 0 ; + } + + ~VariableSizeElemArray_SampledPointers() + { + Free() ; + } + + void Free() + { + if (M != NULL) + free(M) ; + if (P != NULL) + free(P) ; + M = NULL ; + P = NULL ; + } + + // Create the variable size element array + // b - block size. 
has to be > 1 + // in - input array + // n - the length of input array + void InitFromArray(int blockSize, const unsigned int *in, const size_t &n) + { + size_t i ; + size_t totalEncodeBits = 0 ; + for (i = 0 ; i < n ; ++i) + { + int bcnt = Utils::CountBits(in[i] + 1) ; // need to shift 1 to allow 0. + totalEncodeBits += 2 * bcnt - 1 ; // we use gamma encoding because the number of bits for each number is less than 32 in general, which makes it more efficient than delta encoding + } + + b = blockSize ; + + if (b <= 1) + { + b = sizeof(WORD) * 8 ; // extra overhead 1 bit per element + } + + size_t blockCnt = DIV_CEIL(n, b) ; + P = (size_t *)malloc(blockCnt * sizeof(size_t)) ; + space = blockCnt * sizeof(size_t) ; + + M = Utils::MallocByBits(totalEncodeBits) ; // We don't store the highest bit so -n + space += Utils::BitsToWordBytes(totalEncodeBits) ; + + // Encode the data + size_t sumL = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (i % b == 0) + { + P[i/b] = sumL ; + } + + int l ; + WORD x = EliasCode::Gamma(in[i] + 1, l) ; + Utils::BitsWrite(M, sumL, sumL + l - 1, x) ; + + //int tmpl ; + //printf("i=%d: in[i]=%d encode=%lld l=%d. sumL=%d. 
decode=%d\n", i, in[i], x, l, sumL, + // EliasCode::ReadOneGamma(M, sumL, tmpl)); + sumL += l ; + } + } + + unsigned int Read(size_t i) + { + size_t pi = i / b ; // index in P + size_t j = pi * b ; + size_t offset = P[pi] ; + int ret = 1 ; + int l ; + for (; j <= i ; ++j) + { + ret = EliasCode::ReadOneGamma(M, offset, l) ; + offset += l ; + } + + return ret - 1; + } + + int GetSpace() + { + return space + sizeof(*this) ; + } +} ; +} +#endif diff --git a/compactds/bitvector_benchmark.cpp b/compactds/bitvector_benchmark.cpp new file mode 100644 index 0000000..7b42638 --- /dev/null +++ b/compactds/bitvector_benchmark.cpp @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include +#include "Bitvector_Plain.hpp" +#include "Bitvector_Sparse.hpp" +#include "DS_Select_Test.hpp" + +using namespace std ; +using namespace std::chrono ; +using timer = std::chrono::high_resolution_clock; +using namespace compactds ; + +const int n = 800000000 ; +const int reps = 10000000 ; + +void set_random_bits(std::vector &v, int seed) +{ + std::mt19937_64 rng; + if (0 == seed) { + rng.seed(std::chrono::system_clock::now().time_since_epoch().count()); + } else + rng.seed(seed); + + size_t *data = v.data() ; + size_t size = v.size() ; + *data = rng(); + for (size_t i=1; i < size; ++i) { + *(++data) = rng(); + } +} + +std::vector rnd_positions(uint8_t log_s, uint64_t& mask, uint64_t mod=0, uint64_t seed=17) +{ + mask = (1< rands(1< 0) { + size_t i ; + size_t size = rands.size() ; + for (i = 0 ; i < size ; ++i) + rands[i] %= mod ; + } + return rands; +} + + +int main(int argc, char *argv[]) +{ + size_t i ; + auto start = timer::now(); + Bitvector_Plain bv ; + WORD *b = Utils::MallocByBits(n) ; + + std::mt19937_64 rng; + std::uniform_int_distribution distribution(0, n-1); + auto dice = bind(distribution, rng); + + // populate vectors with some other bits + for (i=0; i < n/25; ++i) { + uint64_t x = dice(); + Utils::BitSet(b, x) ; + } + auto stop = timer::now(); + cout << "initialization 
in (ms): " << duration_cast(stop-start).count() << endl; + + cout << "size in byptes: " << Utils::BitsToWordBytes(n) << endl; + + start = timer::now(); + bv.SetSelectTypeSupport(3) ; + if (argc == 1) + bv.SetSelectSpeed(3) ; + else + bv.SetSelectSpeed(atoi(argv[1])) ; + DS_Rank9 ranktst ; + bv.Init(b, n) ; + ranktst.Init(b, n) ; + DS_Select_Test selectTst ; + selectTst.Init(0, b, n, 2, 3) ; + stop = timer::now() ; + cout << "construction in (ms): " << duration_cast(stop-start).count() << endl; + cout << "size in bytes: " << bv.GetSpace() << endl; + + auto ones = bv.Rank(1, n) ; + auto zeros = n-ones; + if (0) + { + uint64_t mask = 0; + + auto rands = rnd_positions(20, mask, zeros, 17); + for (uint64_t i=0; i +#include +#include +#include + +#include +#include +#include +#include + +#include "SequenceCompactor.hpp" +#include "Sequence_WaveletTree.hpp" +#include "Sequence_RunLength.hpp" +#include "Sequence_Hybrid.hpp" +#include "Sequence_RunBlock.hpp" +#include "FMBuilder.hpp" +#include "FMIndex.hpp" + +// Usage: ./a.out (sequence_file|bwt) [load] +using namespace std::chrono ; +using timer = std::chrono::high_resolution_clock; + +using namespace compactds ; + +int main(int argc, char *argv[]) +{ + std::string seq ; + FixedSizeElemArray s ; + + char abList[] = "ACGT" ; + FixedSizeElemArray BWT ; + size_t n = 0 ; + const size_t maxTestCnt = 10000000 ; + + if (atoi(argv[2]) == 0 || argc <= 2) + { + std::ifstream ifs(argv[1], std::ifstream::in) ; + std::getline(ifs, seq) ; + SequenceCompactor seqCompactor ; + seqCompactor.Init(abList, s, 1000000) ; + seqCompactor.Compact(seq.c_str(), s) ; + + n = s.GetSize() ; + struct _FMBuilderParam param ; + struct _FMIndexAuxData fmAuxData ; + param.threadCnt = 4 ; + param.saBlockSize = n / param.threadCnt ; + + FMBuilder::InferParametersGivenMemory(n, strlen(abList), Utils::SpaceStringToBytes("24G"), param) ; + size_t firstISA = 0 ; + FMBuilder::Build(s, n, strlen(abList), + BWT, firstISA, param) ; + param.Free() ; + FILE *fp = 
fopen("tmp.idx", "w") ; + BWT.Save(fp) ; + fclose(fp) ; + } + else + { + FILE *fp = fopen(argv[1], "r") ; + BWT.Load(fp) ; + fclose(fp) ; + + n = BWT.GetSize() ; + } + printf("Total size: %lu\n", n) ; + + { + Sequence_WaveletTree<> plbwt ; // plain bwt + plbwt.SetSelectSpeed(0) ; + plbwt.Init(BWT, n, abList) ; + printf("Plain bwt space (bytes): %lu\n", plbwt.GetSpace()) ; + + auto start = timer::now(); + size_t check = 0 ; + size_t i ; + for (i = 0 ; i < n && i < maxTestCnt ; ++i) + { + size_t x = plbwt.Rank('A', i) ; + check += x ; + } + auto stop = timer::now(); + std::cout << "# rank time (ns) from " << i << " = " << duration_cast(stop-start).count()/(double)i << std::endl; + std::cout << "# rank sum = " << check << std::endl; + } + + { + Sequence_RunLength rlbwt ; + rlbwt.Init(BWT, n, abList) ; + rlbwt.PrintStats() ; + printf("Runlength bwt space (bytes): %lu\n", rlbwt.GetSpace()) ; + + auto start = timer::now(); + size_t check = 0 ; + size_t i ; + for (i = 0 ; i < n && i < maxTestCnt ; ++i) + { + size_t x = rlbwt.Rank('A', i) ; + check += x ; + } + auto stop = timer::now(); + std::cout << "# rank time (ns) from " << i << " = " << duration_cast(stop-start).count()/(double)i << std::endl; + std::cout << "# rank sum = " << check << std::endl; + } + + if (1) + { + Sequence_Hybrid hybbwt ; + //hybbwt.SetBlockSize(8) ; + hybbwt.Init(BWT, n, abList) ; + hybbwt.PrintStats() ; + printf("Hybrid bwt space (bytes): %lu\n", hybbwt.GetSpace()) ; + + auto start = timer::now(); + size_t check = 0 ; + size_t i ; + for (i = 0 ; i < n && i < maxTestCnt ; ++i) + { + size_t x = hybbwt.Rank('A', i) ; + check += x ; + } + auto stop = timer::now(); + std::cout << "# rank time (ns) from " << i << " = " << duration_cast(stop-start).count()/(double)i << std::endl; + std::cout << "# rank sum = " << check << std::endl; + } + + { + Sequence_RunBlock rbbwt ; + //rbbwt.SetBlockSize(5) ; + rbbwt.Init(BWT, n, abList) ; + rbbwt.PrintStats() ; + printf("RunBlock bwt space (bytes): %lu\n", 
rbbwt.GetSpace()) ; + + auto start = timer::now(); + size_t check = 0 ; + size_t i ; + for (i = 0 ; i < n && i < maxTestCnt ; ++i) + { + size_t x = rbbwt.Rank('A', i) ; + check += x ; + } + auto stop = timer::now(); + std::cout << "# rank time (ns) from " << i << " = " << duration_cast(stop-start).count()/(double)i << std::endl; + std::cout << "# rank sum = " << check << std::endl; + } + + return 0 ; +} diff --git a/compactds/test.cpp b/compactds/test.cpp new file mode 100644 index 0000000..cfa29ab --- /dev/null +++ b/compactds/test.cpp @@ -0,0 +1,1881 @@ +#include +#include +#include +#include + +#include "FixedSizeElemArray.hpp" +#include "FractionBitElemArray.hpp" +#include "VariableSizeElemArray_SampledPointers.hpp" +#include "VariableSizeElemArray_DensePointers.hpp" +#include "InterleavedFixedSizeElemArray.hpp" + +#include "Bitvector_Plain.hpp" +#include "Bitvector_Compressed.hpp" +#include "Bitvector_Sparse.hpp" +#include "Bitvector_RunLength.hpp" + +#include "Sequence_Plain.hpp" +#include "Sequence_WaveletTree.hpp" +#include "Sequence_RunLength.hpp" +#include "Sequence_Hybrid.hpp" +#include "Sequence_RunBlock.hpp" + +#include "PerfectHash.hpp" +#include "PartialSum.hpp" + +#include "SuffixArrayGenerator.hpp" +#include "FMBuilder.hpp" +#include "FMIndex.hpp" + +#include "DS_InvPermutation.hpp" +#include "Permutation.hpp" +#include "InvertedIndex.hpp" + +#include "DS_Parenthesis.hpp" +#include "DS_PatternRankSelect.hpp" + +#include "Tree_Plain.hpp" +#include "Tree_LOUDS.hpp" +#include "Tree_BP.hpp" +#include "Tree_DFUDS.hpp" + +#include "Tree_Cardinal_Plain.hpp" +#include "Tree_Cardinal_LOUDS.hpp" +#include "Tree_Cardinal_Ordinal.hpp" + +#include "Tree_Labeled.hpp" + +using namespace compactds ; + +void PrintLog( const char *fmt, ... 
) +{ + va_list args ; + va_start( args, fmt ) ; + char buffer[500] ; + vsprintf( buffer, fmt, args ) ; + + time_t mytime = time(NULL) ; + struct tm *localT = localtime( &mytime ) ; + char stime[500] ; + strftime( stime, sizeof( stime ), "%c", localT ) ; + fprintf( stderr, "[%s] %s\n", stime, buffer ) ; +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) + { + fprintf(stderr, "Usage: ./test test_case\n") ; + exit(1) ; + } + + size_t i ; + unsigned int mismatchCnt = 0 ; + //int array[] = {20, 18, 22, 22, 16, 21, 11, 22, 21, 21, 5, 7, 31, 0, 3} ; + //int array[] = {0xfffffff} ; + if (!strcmp(argv[1], "array")) + { + /*int array[] = {0, 0xfff, 0, 1, 2, 3, 4, 5, 6, 8, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} ;*/ + //int array[] = {0, 1, 2} ; + const int n = 1000 ; + unsigned int array[n] ; + for (i = 0 ; i < n ; ++i) + array[i] = (i * 7 + 3)%3 ; // trits + unsigned int len = sizeof(array) / sizeof(array[0]) ; + printf("Raw size: %d\n\n", 
(int)sizeof(array)) ; + + { + FixedSizeElemArray fsea ; + //B.Malloc(5, len) ; + //for (i = 0 ; i < len ; ++i) + // B.Write(i, array[i]) ; + fsea.InitFromArray(-1, array, len) ; + mismatchCnt = 0 ; + printf("Fixed-size element array:\n") ; + for (i = 0 ; i < len ; ++i) + { + if (fsea.Read(i) != array[i]) + { + ++mismatchCnt ; + } + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)fsea.GetSpace()); + } + + { + FixedSizeElemArray fsea ; + //B.Malloc(5, len) ; + //for (i = 0 ; i < len ; ++i) + // B.Write(i, array[i]) ; + fsea.InitFromArray(-1, array, len) ; + + FILE *fp = fopen("tmp.out", "w") ; + fsea.Save(fp) ; + fclose(fp) ; + + fp = fopen("tmp.out", "r") ; + fsea.Load(fp) ; + fclose(fp) ; + + mismatchCnt = 0 ; + printf("\nFixed-size element array load/save:\n") ; + for (i = 0 ; i < len ; ++i) + { + if (fsea.Read(i) != array[i]) + { + ++mismatchCnt ; + } + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)fsea.GetSpace()); + } + + { + FixedSizeElemArray fsea ; + //B.Malloc(5, len) ; + //for (i = 0 ; i < len ; ++i) + // B.Write(i, array[i]) ; + fsea.Malloc(2, 0); + fsea.Reserve(5); + for (i = 0 ; i < len ; ++i) + fsea.PushBack(array[i]); + mismatchCnt = 0 ; + printf("\nFixed-size element array with push back:\n") ; + for (i = 0 ; i < len ; ++i) + { + if (fsea.Read(i) != array[i]) + { + ++mismatchCnt ; + } + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)fsea.GetSpace()); + } + + FractionBitElemArray fbea ; + fbea.InitFromArray(0, array, len) ; + printf("\nFraction bits element array:\n") ; + mismatchCnt = 0 ; + printf("Fixed-size element array:\n") ; + for (i = 0 ; i < len ; ++i) + { + if (fbea.Read(i) != array[i]) + ++mismatchCnt ; + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)fbea.GetSpace()); + + int blockSize = -1 ; + VariableSizeElemArray_SampledPointers vseasp ; + 
vseasp.InitFromArray(blockSize, array, len) ; + printf("\nSampled pointers:\n") ; + mismatchCnt = 0 ; + for (i = 0 ; i < len ; ++i) + { + if (vseasp.Read(i) != array[i]) + ++mismatchCnt ; + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)vseasp.GetSpace()); + + VariableSizeElemArray_DensePointers vseadp ; + vseadp.InitFromArray(blockSize, array, len) ; + printf("\nDense pointers:\n") ; + mismatchCnt = 0 ; + for (i = 0 ; i < len ; ++i) + { + if (vseadp.Read(i) != array[i]) + ++mismatchCnt ; + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)vseadp.GetSpace()); + + { + printf("\nInterleaved array:\n") ; + ILArray il ; + int block = 3 ; + il.Malloc(2, DIV_CEIL(n, block), 2, block - 1) ; + for (i = 0 ; i < len ; ++i) + { + if (i%block == 0) + il.Write(0, i / block, array[i]) ; + else + il.Write(1, i - i / block, array[i]) ; + } + mismatchCnt = 0 ; + for (i = 0 ; i < len ; ++i) + { + if (i%block == 0) + { + if (il.Read(0, i / block) != array[i]) + ++mismatchCnt ; + } + else + { + if (il.Read(1, i - i / block) != array[i]) + ++mismatchCnt ; + } + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)il.GetSpace()); + } + + { + printf("\nInterleaved64 array:\n") ; + IL64Array il64 ; + int block = 3 ; + il64.Malloc(DIV_CEIL(n, block), 2, block - 1) ; + for (i = 0 ; i < len ; ++i) + { + if (i%block == 0) + il64.Write0(i / block, array[i]) ; + else + il64.Write1(i - i / block, array[i]) ; + } + mismatchCnt = 0 ; + for (i = 0 ; i < len ; ++i) + { + if (i%block == 0) + { + if (il64.Read0(i / block) != array[i]) + ++mismatchCnt ; + } + else + { + if (il64.Read1(i - i / block) != array[i]) + ++mismatchCnt ; + } + } + printf("mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (bytes): %d\n", (int)il64.GetSpace()); + } + } +#if 0 // comment out large chunk of the code for compile efficiency. 
Remove this in future + else if (!strcmp(argv[1], "bitvector")) + { + int k = 0 ; + unsigned int sum ; + WORD *B ; + size_t n = 1000000 ; + + B = Utils::MallocByBits(n) ; + + //for (i = 0 ; i*1 < n ; ++i ) + // Utils::BitSet(B, i*1) ; + for (i = 0 ; i < n ; ) + { + int rlen = 5 ; + if (argc > 2) + rlen = atoi(argv[2]) ; + for (int j = 0 ; j < rlen ; ++j) + Utils::BitSet(B, i + j) ; + i += 4 * rlen ; + } + /*for (i = 0 ; i < n ; ++i) + { + if (rand() & 1) + Utils::BitSet(B, i) ; + }*/ + + printf("Raw size: %d\n", (int)DIV_CEIL(n, 8)) ; + + //------ + { + PrintLog("Plain bitvector:") ; + Bitvector_Plain bvp ; + bvp.SetSelectSpeed(1) ; + bvp.Init(B, n) ; + for (i = 0 ; i < n ; ++i) + { + if (bvp.Access(i) != Utils::BitRead(B, i)) + ++mismatchCnt ; + //printf("%d %d\n", bvc.Access(i), array.Read(i)) ; + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + ++sum ; + if (bvp.Rank(1, i) != sum) + ++mismatchCnt ; + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + for (int type = 1 ; type >= 1 ; --type) + { + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == type) + { + size_t s = bvp.Select(type, k + 1) ; + if (s != i) + { + ++mismatchCnt ; + //printf("mismatch %d: %d %d\n", k + 1, s, i) ; + } + ++k ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + for (k = i ; k >= 0 ; --k) + if (bvp.Access(k) == 1) + break ; + if (k < 0) + break ; + if ((int)bvp.Pred(i) != k) + ++mismatchCnt ; + + } + printf("Pred mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + for (k = i ; k < (int)n ; ++k) + if (bvp.Access(k) == 1) + break ; + if (k >= (int)n) + break ; + if ((int)bvp.Succ(i) != k) + ++mismatchCnt ; + } + printf("Succ mismatch count: %d\n", mismatchCnt) ; + + printf("Space usage (byptes): 
%d\n\n", (int)bvp.GetSpace()) ; + } + + + // ------ + { + PrintLog("Compressed bitvector:") ; + Bitvector_Compressed bvc ; + bvc.Init(B, n) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (bvc.Access(i) != Utils::BitRead(B, i)) + ++mismatchCnt ; + //printf("%d %d\n", bvc.Access(i), array.Read(i)) ; + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + ++sum ; + if (bvc.Rank(1, i/*, inclusive=1*/) != sum) + ++mismatchCnt ; + //printf("%d %d\n", bvc.Rank(i), sum) ; + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + { + if (bvc.Select(k + 1) != i) + ++mismatchCnt ; + ++k ; + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + printf("Space usage (byptes): %d\n\n", (int)bvc.GetSpace()) ; + } + //----- + { + PrintLog("Sparse bitvector:") ; + Bitvector_Sparse bvs ; + bvs.Init(B, n) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (bvs.Access(i) != Utils::BitRead(B, i)) + { + ++mismatchCnt ; + //printf("%d: %d %d\n", i, bvs.Access(i), Utils::BitRead(B, i)) ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + ++sum ; + if (bvs.Rank(1, i) != sum) + { + ++mismatchCnt ; + //printf("compare %d: %d %d\n", i, bvs.Rank(1, i), sum) ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + { + if (bvs.Select(k + 1) != i) + ++mismatchCnt ; + ++k ; + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (byptes): %d\n\n", (int)bvs.GetSpace()) ; + } + + if (1) + { + PrintLog("Sparse bitvector load/save:") ; + Bitvector_Sparse bvs ; + bvs.Init(B, n) ; + + FILE *fp = fopen("tmp.out", "w") ; + 
bvs.Save(fp) ; + fclose(fp) ; + + fp = fopen("tmp.out", "r") ; + bvs.Load(fp) ; + fclose(fp) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (bvs.Access(i) != Utils::BitRead(B, i)) + { + ++mismatchCnt ; + //printf("%d: %d %d\n", i, bvs.Access(i), Utils::BitRead(B, i)) ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + ++sum ; + if (bvs.Rank(1, i) != sum) + { + ++mismatchCnt ; + //printf("compare %d: %d %d\n", i, bvs.Rank(1, i), sum) ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + { + if (bvs.Select(k + 1) != i) + ++mismatchCnt ; + ++k ; + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (byptes): %d\n\n", (int)bvs.GetSpace()) ; + } + + //----- + { + PrintLog("Run-length bitvector:") ; + Bitvector_RunLength bvr ; + bvr.Init(B, n) ; + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (bvr.Access(i) != Utils::BitRead(B, i)) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + ++sum ; + if (bvr.Rank(1, i) != sum) + { + ++mismatchCnt ; + //printf("compare %d: %d %d\n", i, bvr.Rank(1, i), sum) ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (Utils::BitRead(B, i) == 1) + { + if (bvr.Select(k + 1) != i) + { + //printf("compare %d: %d %d\n", k, bvr.Select(k + 1), i) ; + ++mismatchCnt ; + } + ++k ; + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + printf("Space usage (byptes): %d\n\n", (int)bvr.GetSpace()) ; + } + free(B) ; + } + else if (!strcmp(argv[1], "sequence")) + { + char abList[] = "ACGT" ; + Alphabet abCode ; + abCode.InitFromList(abList, strlen(abList)) ; + + 
size_t n = 1000000 ; + FixedSizeElemArray S ; + S.Malloc(2, n) ; + + if (1) + { + //FILE *fp = fopen("testdata/bwt_1M.out", "r") ; + FILE *fp = fopen("testdata/bwt_7M-8M.out", "r") ; + //FILE *fp = fopen("testdata/bwt_2M.out", "r") ; + //FILE *fp = fopen("testdata/tmp.out", "r") ; + int t ; + for (i = 0 ; i < n ; ++i) + { + fscanf(fp, "%d", &t) ; + S.Write(i, t) ; + } + fclose(fp) ; + } + else + { + /*srand(1) ; + for (i = 0 ; i < n ; ++i) + { + S.Write(i, rand()%4) ; + }*/ + size_t rlen = 5 ; + if (argc > 2) + rlen = atoi(argv[2]) ; + uint8_t prevc = -1 ; + for (i = 0 ; i < n ; i += rlen) + { + uint8_t c = rand() % 4; + while (c == prevc) + c = rand() % 4 ; + for (size_t j = 0 ; j < rlen ; ++j) + S.Write(i + j, c) ; + prevc = c ; + } + } + + printf("Raw size: %d\n", (int)S.GetSpace()) ; + + if (0) + { + printf("\nPlain+Bitvector_Plain\n") ; + Sequence_Plain t ; + t.SetAlphabet(abCode) ; + t.Init(S, n, abList ) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + if (t.Rank(abList[j], i) != sum) + ++mismatchCnt ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + if (t.Select(abList[j], cnt) != i) + { + //printf("%d: %d %d %d\n", (int)j, cnt, t.Select(abList[j], cnt), i) ; + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + + if (1) + { + printf("\nWavelet tree + plain bitvector:\n") ; + Sequence_WaveletTree<> t ; + t.SetAlphabet(abCode) ; + t.Init(S, n, abList ) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if 
(t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + if (t.Rank(abList[j], i) != sum) + ++mismatchCnt ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ; + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + + if (0) + { + printf("\nsave/load:\n") ; + Sequence_Hybrid t ; + t.SetAlphabet(abCode) ; + t.Init(S, n, abList) ; + + FILE *fp = fopen("tmp.out", "w") ; + t.Save(fp) ; + fclose(fp) ; + + fp = fopen("tmp.out", "r") ; + t.Load(fp) ; + fclose(fp) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + if (t.Rank(abList[j], i) != sum) + ++mismatchCnt ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + if (1) + { + printf("\nWavelet tree + run-length bitvector:\n") ; + Sequence_WaveletTree t ; + 
t.SetAlphabet(abCode) ; + t.Init(S, n, abList ) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + if (t.Rank(abList[j], i) != sum) + ++mismatchCnt ; + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + + { + printf("\nRun length:\n") ; + Sequence_RunLength t ; + //t.SetAlphabet(abCode) ; + t.Init(S, n, abList ) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + //printf("%d: %d %d %d\n", j, sum, t.Rank(abList[j], i)) ; + if (t.Rank(abList[j], i) != sum) + { + //printf("ERROR\n") ; + ++mismatchCnt ; + } + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + + { + 
printf("\nHybrid:\n") ; + Sequence_Hybrid t ; + //t.SetAlphabet(abCode) ; + t.SetBlockSize(8) ; + t.Init(S, n, abList ) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + //printf("%d: %d %d %d\n", i, j, sum, t.Rank(abList[j], i)) ; + if (t.Rank(abList[j], i) != sum) + { + //printf("ERROR\n") ; + ++mismatchCnt ; + } + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + { + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + { + printf("\nRunBlock:\n") ; + Sequence_RunBlock t ; + //t.SetAlphabet(abCode) ; + t.Init(S, n, abList ) ; + + /*FILE *fp = fopen("tmp.out", "w") ; + t.Save(fp) ; + fclose(fp) ; + + fp = fopen("tmp.out", "r") ; + t.Load(fp) ; + fclose(fp) ;*/ + + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (t.Access(i) != abList[S.Read(i)]) + { + ++mismatchCnt ; + } + } + printf("Access mismatch count: %d\n", mismatchCnt) ; + + size_t j = 0 ; + mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + uint64_t sum = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (S.Read(i) == j) + ++sum ; + //printf("%d: %d %d %d\n", i, j, sum, t.Rank(abList[j], i)) ; + if (t.Rank(abList[j], i) != sum) + { + //printf("ERROR\n") ; + ++mismatchCnt ; + } + } + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + /*mismatchCnt = 0 ; + for (j = 0 ; abList[j] ; ++j) + { + int cnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (S.Read(i) == j) + 
{ + ++cnt ; + //printf("%d: %d %d %d\n", j, cnt, t.Select(abList[j], cnt), i) ; + if (t.Select(abList[j], cnt) != i) + { + ++mismatchCnt ; + } + } + } + printf("Select mismatch count: %d\n", mismatchCnt) ;*/ + + printf("Space usage (byptes): %d\n\n", (int)t.GetSpace()) ; + } + } + else if (!strcmp(argv[1], "hash")) + { + const int n = 20 ; + uint64_t array[n] ; + + for (i = 0 ; i < n ; ++i) + array[i] = i ; + UniversalHashGenerator uh ; + uint64_t a, b ; + int j ; + uh.Init(2 * n, /*seed=*/0) ; + printf("Universal hash:\n") ; + for (j = 0 ; j < 3 ; ++j) + { + uh.Generate(a, b) ; + printf("Hash%d %llu %llu\n", j, (long long unsigned)a, (long long unsigned)b) ; + for (i = 0 ; i < n ; ++i) + printf("%d ", (int)uh.Map(a, b, array[i])) ; + printf("\n") ; + } + printf("\n") ; + + PerfectHash perfhash ; + perfhash.Init(array, n, /*m=*/0) ; + printf("Perfect hash:\n") ; + for (i = 0 ; i < n ; ++i) + { + printf("%d ", (int)perfhash.Map(array[i])) ; + } + printf("\n") ; + printf("Space usage (bytes): %d\n", (int)perfhash.GetSpace()) ; + } + else if (!strcmp(argv[1], "huffman")) + { + const int n = 4 ; + uint64_t freq[n] = {5, 10, 100, 1}; + HuffmanCode huffmanCode ; + huffmanCode.InitFromFrequency(freq, n) ; + + printf("Huffman code:\n") ; + for (i = 0 ; i < n ; ++i) + { + int l = 0 ; + WORD code = huffmanCode.Encode(i, l) ; + printf("%d %d: %llu %d => %d\n", (int)i, (int)freq[i], (long long unsigned)code, l, + huffmanCode.Decode(code, l)) ; + } + printf("Space usage (bytes): %d\n", (int)huffmanCode.GetSpace()) ; + } + else if (!strcmp(argv[1], "partialsum")) + { + const int n = 100 ; + int array[n] ;//= {0, 0, 0}; + for (i = 0 ; i < n ; ++i) + array[i] = i ; + array[10] = 0 ; + + PartialSum psum ; + psum.Init(array, n) ; + + printf("Succinct partial sum:\n") ; + int s = 0 ; + mismatchCnt = 0 ; + for (i = 0 ; i <= n ; ++i) + { + if (s != (int)psum.Sum(i)) + { + ++mismatchCnt ; + } + s += array[i] ; + } + printf("Sum query mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt 
= 0 ; + int j ; + s = 0 ; + for (i = 0 ; i < n ; ++i) + { + for (j = 0 ; j < array[i] ; ++j) + { + if (psum.Search(s + j) != i) + ++mismatchCnt ; + } + s += array[i] ; + } + printf("Search mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (psum.AccessValue(i) != array[i]) + ++mismatchCnt ; + } + printf("AccessValue mismatch count: %d\n", mismatchCnt) ; + + printf("Space usage (bytes): %d\n", (int)psum.GetSpace()) ; + } + else if (!strcmp(argv[1], "sa")) + { + FixedSizeElemArray s ; + size_t n = 10000 ; + s.Malloc(2, n) ; + srand(1) ; + for (i = 0 ; i < (size_t)n ; ++i) + { + s.Write(i, rand() % 4) ; + //s.Write(i, i % 4) ; + //printf("%d ", s.Read(i)) ; + } + //printf("\n") ; + /*std::vector truth ; + for (j = 0 ; j <= 3 ; ++j) + { + int i ; + for (i = n - 1 - (n - 1) % 4 + j ; i >= 0 ; i -= 4) + { + if (i >= n) + continue ; + truth.push_back(i) ; + } + }*/ + + // Check the cuts + SuffixArrayGenerator saGenerator ; + size_t cutCnt = saGenerator.Init(s, n, n / 4, /*diffcov_v=*/4096, 4) ; + /*for (i = 0 ; i < cutCnt ; ++i) + { + std::vector< std::vector > pos = saGenerator.GetChunksPositions(s, n, i, i) ; + //printf("%d\n", pos[0].size()) ; + int size = pos[0].size() ; + int j ; + for (j = 0 ; j < size ; ++j) + printf("%d ", pos[0][j]) ; + printf("\n") ; + }*/ + size_t *sa = (size_t *)malloc(sizeof(size_t) * n); + size_t calculated = 0 ; + for (i = 0 ; i < cutCnt ; ++i) + { + std::vector< std::vector > pos ; + saGenerator.GetChunksPositions(s, n, i, i, 0, n - 1, pos) ; + int size = pos[0].size() ; + printf("%lu %d. 
%lu\n", i, size, calculated) ; + saGenerator.SortSuffixByPos(s, n, pos[0].data(), size, sa + calculated) ; + calculated += size ; + } + printf("Validate result: %d\n", saGenerator.ValidateSA(s, n, sa)) ; + free(sa) ; + + /*mismatchCnt = 0 ; + for (i = 0 ; i < (size_t)n ; ++i) + { + if (truth[i] != sa[i]) + ++mismatchCnt ; + //printf("%d %d\n", (int)truth[i], (int)sa[i]) ; + } + printf("SA mismatch: %d\n", mismatchCnt) ;*/ + + } + else if (!strcmp(argv[1], "fm")) + { + FixedSizeElemArray s ; + const size_t n = 10000 ; + const size_t testLen = 50 ; + s.Malloc(2, n) ; + char strs[n + 1] ; + srand(1) ; + char abList[] = "ACGT" ; + for (i = 0 ; i < (size_t)n ; ++i) + { + int r = rand() % 4 ; + s.Write(i, r) ; + strs[i] = abList[r] ; + } + //s.Print(stdout) ; + //printf("%s\n", strs) ; + struct _FMBuilderParam param ; + struct _FMIndexAuxData fmAuxData ; + param.threadCnt = 4 ; + param.saBlockSize = n / 4 ; + FixedSizeElemArray BWT ; + param.precomputeWidth = testLen > 10 ? 10 : testLen ; + param.maxLcp = 17 ; + + size_t firstISA = 0 ; + param.selectedISA[0] = 0 ; + param.selectedISA[1] = 0 ; + FMBuilder::Build(s, n, 4, BWT, firstISA, param) ; + + Sequence_RunBlock t ; + t.Init(BWT, n, abList) ; + + //BWT.Print(stdout) ; + // + //printf("%d %d\n", precomputedRange[0].first, precomputedRange[0].second) ; + /*for (i = 0 ; i < 100 ; ++i) + { + if (precomputedRange[i].second > 0) + printf("%d %d\n", precomputedRange[i].first, precomputedRange[i].second) ; + }*/ + size_t count = 0 ; + for (i = 0 ; i < n ; i += WORDBITS) + count += Utils::Popcount(param.semiLcpGreater[i / WORDBITS]) ; + printf("Number of 1s in semiLcpGreater: %lu\n", count) ; + + count = 0 ; + for (i = 0 ; i < n ; i += WORDBITS) + count += Utils::Popcount(param.semiLcpEqual[i / WORDBITS]) ; + printf("Number of 1s in semiLcpEqual: %lu\n", count) ; + + FMIndex< Sequence_WaveletTree > fmIndex ; + //FMIndex< Sequence_Plain > fmIndex ; + //FMIndex< Sequence_RunBlock > fmIndex ; + fmIndex.Init(BWT, n, firstISA, + 
param, + abList, strlen(abList)) ; + printf("firstISA = %lu; lastISA = %lu\n", firstISA, fmIndex.GetLastISA()) ; + + size_t sp, ep, l ; + char test[testLen + 1] ; + test[testLen] = '\0' ; + size_t k ; + size_t mismatchCnt = 0 ; + size_t compareCnt = 0 ; + for (k = 0 ; k + testLen <= n; ++k) + { + memcpy(test, strs + k, testLen) ; + //strcpy(test, "GATGGAGATG") ; + //printf("test: %s\n", test) ; + l = fmIndex.BackwardSearch(test, strlen(test), sp, ep) ; + //printf("Backward search %d %d %d\n", l, sp, ep) ; + if (sp < ep) + continue ; + ++compareCnt ; + for (i = sp ; i <= ep ; ++i) + { + size_t sa = fmIndex.BackwardToSampledSA(i, l) ; + if (sa + l != k) + { + ++mismatchCnt ; + printf("SA[%lu] = %lu+%lu. %lu\n", i, sa, l, k) ; + } + } + } + printf("Mismatch count: %lu out of %lu\n", mismatchCnt, compareCnt) ; + + /*FILE *fp = fopen("tmp.out", "w") ; + fmIndex.Save(fp) ; + fclose(fp) ; + + fp = fopen("tmp.out", "r") ; + fmIndex.Load(fp) ; + fclose(fp) ; + + printf("Save/Load:\n") ; + l = fmIndex.BackwardSearch(test, strlen(test), sp, ep) ; + printf("Backward search %d %d %d\n", l, sp, ep) ; + for (i = sp ; i <= ep ; ++i) + { + size_t sa = fmIndex.BackwardToSampledSA(i, l) ; + printf("SA[%d] = %d+%d\n", i, sa, l) ; + }*/ + + + //free(sampledSa) ; + //free(precomputedRange) ; + //free(semiLcpGreater) ; + //free(semiLcpEqual) ; + } + else if (!strcmp(argv[1], "diffcover")) + { + DifferenceCover dc ; + unsigned int v = 4096 ; + dc.Init(v) ; + size_t j ; + mismatchCnt = 0 ; + for (i = 0 ; i < v ; ++i) + for (j = 0 ; j < v ; ++j) + { + int d = dc.Delta(i, j) ; + if (!dc.IsInDC(i + d) || !dc.IsInDC(j + d)) + { + ++mismatchCnt ; + } + } + printf("%d\n", mismatchCnt) ; + } + else if (!strcmp(argv[1], "permutation")) + { + const int n = 1000; + size_t *perm = new size_t[n]; + size_t *inv = new size_t[n] ; + for (i = 0 ; i < n ; ++i) + perm[i] = (i + 1)%n ; + printf("Raw permutation size %lu\n", n * sizeof(perm[0])) ; + for (i = 0 ; i < n ; ++i) + { + if (0) + { + size_t tmp ; + 
size_t j = i + rand() % (n - i) ; + tmp = perm[j] ; + perm[j] = perm[i] ; + perm[i] = tmp ; + } + //perm[i] = (i*10001+1)%n ; + inv[ perm[i] ] = i ; + } + //for (i = 0 ; i < n ; ++i) + // printf("%d ", perm[i]) ; + //printf("\n") ; + + { + printf("\ninverse permutation\n") ; + DS_InvPermutation invperm ; + invperm.Init(perm, n) ; + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + if (invperm.Query(perm, i) != inv[i]) + ++mismatchCnt ; + printf("Inverse mismatch count %d\n", mismatchCnt) ; + printf("Space usage: %d\n", (int)invperm.GetSpace()) ; + } + + { + printf("\ncompressed permutation\n") ; + Permutation cperm ; + cperm.Init(perm, n) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (cperm.Next(i) != perm[i]) + ++mismatchCnt ; + } + printf("Next mismatch count %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (cperm.Prev(i) != inv[i]) + ++mismatchCnt ; + } + printf("Prev mismatch count %d\n", mismatchCnt) ; + + printf("Space usage: %d\n", (int)cperm.GetSpace()) ; + } + + delete[] perm ; + delete[] inv ; + } + else if (!strcmp(argv[1], "invindex")) + { + size_t n = 10000 ; + FixedSizeElemArray a ; + a.Malloc(3, n) ; + size_t i ; + srand(17) ; + int stride = 5 ; + for (i = 0 ; i < n ; ++i) + { + a.Write(i, i%stride) ; + } + + InvertedIndex idx ; + idx.Init(a, n, false) ; + size_t mismatchCnt = 0 ; + printf("Raw sequence space usage %lu\n", a.GetSpace()) ; + int label = 1 ; + for (i = label ; i < n ; i += stride) + { + if (idx.Search(label, i / stride) != i) + { + //printf("%lu %lu\n", i, idx.Search(label, i / stride)) ; + ++mismatchCnt ; + } + } + printf("Inverted index mismatch count %lu\n", mismatchCnt) ; + printf("Inverted index based on permutation space usage %lu\n", idx.GetSpace()) ; + } +#endif + else if (!strcmp(argv[1], "rmmtree")) + { + int n = 1000000 ; + int i, j ; + WORD *B = Utils::MallocByBits(n) ; + printf("Raw representation space usage: %lu\n", Utils::BitsToWordBytes(n)) ; + srand(1) ; + for (i = 0 ; i < 
n ; ++i) + { + if (rand() & 1) + //if (i < n / 2) + //if (i % 2 == 0) + Utils::BitSet(B, i) ; + } + + DS_RangeMinMaxTree rmmTree ; + rmmTree.SetBlockSize(32) ; + rmmTree.Init(B, n) ; + + // Test forward and backward search + if (0) + { + int d ; + int stride = 11 ; + for (d = -stride ; d <= stride ; d += 2 * stride) + { + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + int excess = 0 ; + for (j = i ; j < n ; ++j) + { + excess += 2 * Utils::BitRead(B, j) - 1 ; + if (excess == d) + break ; + } + int truth = j ; + j = rmmTree.FwdSearch(i, d, B, n) ; + if (j != truth) + ++mismatchCnt ; + //if (j != truth) + // printf("%d %d %d\n", i, truth, j) ; + } + printf("FwdSearch %d mismatch count %u\n", d, mismatchCnt) ; + } + + for (d = -stride ; d <= stride ; d += 2 * stride) + { + mismatchCnt = 0 ; + for (i = 0 ; i < n ; ++i) + { + int excess = 0 ; + for (j = i ; j >= 0 ; --j) + { + excess -= (2 * Utils::BitRead(B, j) - 1) ; + if (excess == d) + break ; + } + int truth = j ; + if (truth == -1) + truth = n ; + j = rmmTree.BwdSearch(i, d, B, n) ; + if (j != truth) + ++mismatchCnt ; + //if (j != truth) + // printf("%d %d %d\n", i, truth, j) ; + } + printf("BwdSearch %d mismatch count %u\n", d, mismatchCnt) ; + } + } + + // Test rmq and rMq + { + int len = 10000 ; + mismatchCnt = 0 ; + for (i = 0 ; i + len <= n ; ++i) + { + int excess = 0 ; + int min = 2 ; + int mintag = i, maxtag = i; + int max = -2 ; + int minCnt = 0 ; + int lastMinTag = 0 ; + for (j = i ; j < i + len ; ++j) + { + excess += (2 * Utils::BitRead(B, j) - 1) ; + if (excess < min) + { + min = excess ; + mintag = j ; + minCnt = 1 ; + lastMinTag = j ; + } + else if (excess == min) + { + ++minCnt ; + lastMinTag = j ; + } + + if (excess > max) + { + max = excess ; + maxtag = j ; + } + } + + //printf("%d %d\n", rmmTree.ExtremeExcess(B, n, i, i + len - 1, 0), min) ; + if (rmmTree.ExtremeExcess(i, i + len - 1, 0, B, n) != min) + { + ++mismatchCnt ; + //printf("min mismatch %d\n", i) ; + } + if (rmmTree.ExtremeExcess(i, 
i + len - 1, 1, B, n) != max) + { + ++mismatchCnt ; + //printf("max mismatch %d\n", i) ; + } + + if ((int)rmmTree.Rmq(i, i + len - 1, B, n) != mintag) + { + ++mismatchCnt ; + } + + if ((int)rmmTree.RMq(i, i + len - 1, B, n) != maxtag) + { + ++mismatchCnt ; + } + + if ((int)rmmTree.MinCount(i, i + len - 1, B, n) != minCnt) + { + ++mismatchCnt ; + //printf("min count mismatch %d: %d %d\n", i, min, minCnt) ; + } + + if ((int)rmmTree.MinSelect(i, i + len - 1, minCnt, B, n) != lastMinTag) + { + ++mismatchCnt ; + //printf("min select mismatch %d: %d %d %d\n", i, min, minCnt, lastMinTag) ; + } + } + printf("extreme excess mismatch count %u\n", mismatchCnt) ; + } + + printf("rmmTree space usage (bytes): %lu\n", rmmTree.GetSpace(true)) ; + } + else if (!strcmp(argv[1], "tree")) + { + // Test example: tree with child count 2 (from root), 3, 4, 5, .... + // Or a binary tree + int i, j ; + Tree_Plain tree ; + tree.Init() ; + + int internalN = 10000 ; + srand(1) ; + for (i = 0 ; i < internalN ; ++i) + { + int childCnt = rand() % 4 + 1 ; + for (j = 0 ; j < childCnt ; ++j) + { + size_t tid = tree.AddNode(i) ; + tree.SetLabel(tid, childCnt) ; + } + } + + size_t *map = new size_t[tree.GetSize()] ; + + if (0) + { + // This test is for binary tree. 
+ mismatchCnt = 0 ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.ChildrenCount(i) != 2 || (int)tree.FirstChild(i) != (2 * i + 1) + || (int)tree.LastChild(i) != (2 * i + 2)) + ++mismatchCnt ; + } + printf("plain tree mismatch count %u\n", mismatchCnt) ; + //printf("plain tree space usage (bytes): %lu\n", tree.GetSpace(true)) ; + } + printf("plain tree space usage (bytes): %lu\n", tree.GetSpace(true)) ; + + { + Tree_LOUDS t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), tree.GetSize(), map) ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.IsLeaf(i)) + continue ; + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.ChildRank(v) != tree.ChildRank(i) + || (!tree.IsLastChild(i) && t.NodeMap(t.NextSibling(v)) != map[tree.NextSibling(i)]) + || (!tree.IsFirstChild(i) && t.NodeMap(t.PrevSibling(v)) != map[tree.PrevSibling(i)]) + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))) != map[tree.LCA(i, internalN)] + ) + ++mismatchCnt ; + } + printf("\nLOUDS tree mismatch count %u\n", mismatchCnt) ; + printf("LOUDS tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + + { + Tree_BP t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), tree.GetSize(), map) ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.IsLeaf(i)) + continue ; + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.ChildRank(v) != tree.ChildRank(i) + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || t.NodeMap(t.ChildSelect(v, 1)) != map[tree.ChildSelect(i, 1)] + || (!tree.IsLastChild(i) && t.NodeMap(t.NextSibling(v)) != map[tree.NextSibling(i)]) + || (!tree.IsFirstChild(i) && t.NodeMap(t.PrevSibling(v)) != 
map[tree.PrevSibling(i)]) + || (t.IsAncestor(v, t.NodeSelect(map[internalN])) != tree.IsAncestor(i, internalN)) + || t.Depth(v) != tree.Depth(i) + || t.SubTreeSize(v) != tree.SubTreeSize(i) + || t.LeafCountInSubTree(v) != tree.LeafCountInSubTree(i) + || t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))) != map[tree.LCA(i, internalN)] + ) + { + ++mismatchCnt ; + //printf("%d %d. %d. %d %d. %d\n", t.LeafCountInSubTree(v), tree.LeafCountInSubTree(i), tree.GetSize(), v, i, tree.ChildrenCount(i)) ; + } + } + + for (i = internalN ; i < (int)tree.GetSize(); ++i) + { + size_t v = t.NodeSelect(map[i]) ; + if (t.LeafCountInSubTree(v) != tree.LeafCountInSubTree(i)) + //|| (int)t.LeafRank(v) != i - internalN + 1) + ++mismatchCnt ; + } + + printf("\nBP tree mismatch count %u\n", mismatchCnt) ; + printf("BP tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + + { + Tree_DFUDS t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), tree.GetSize(), map) ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.IsLeaf(i)) + continue ; + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.ChildRank(v) != tree.ChildRank(i) + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || t.NodeMap(t.ChildSelect(v, 1)) != map[tree.ChildSelect(i, 1)] + || (!tree.IsLastChild(i) && t.NodeMap(t.NextSibling(v)) != map[tree.NextSibling(i)]) + || (!tree.IsFirstChild(i) && t.NodeMap(t.PrevSibling(v)) != map[tree.PrevSibling(i)]) + || (t.ChildrenCount(v) > 1 && t.NodeMap(t.ChildSelect(v, 2)) != map[tree.ChildSelect(i, 2)]) + || (t.IsAncestor(v, t.NodeSelect(map[internalN])) != tree.IsAncestor(i, internalN)) + //|| t.Depth(v) != tree.Depth(i) + || t.SubTreeSize(v) != tree.SubTreeSize(i) + || t.LeafCountInSubTree(v) != tree.LeafCountInSubTree(i) + || t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))) != map[tree.LCA(i, internalN)] + ) + ++mismatchCnt 
; + //printf("%d %d %d\n", v, + // t.NodeSelect(map[internalN]), + // t.LCA(v, t.NodeSelect(map[internalN]))) ; + //printf("%d %d. %d. %d %d. %d\n", t.LeafCountInSubTree(v), tree.LeafCountInSubTree(i), tree.GetSize(), v, i, tree.ChildrenCount(i)) ; + } + + for (i = internalN ; i < (int)tree.GetSize(); ++i) + { + size_t v = t.NodeSelect(map[i]) ; + if (t.LeafCountInSubTree(v) != tree.LeafCountInSubTree(i)) + ++mismatchCnt ; + } + printf("\nDFUDS tree mismatch count %u\n", mismatchCnt) ; + printf("DFUDS tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + + { + Tree_Labeled<> t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), tree.GetSize(), map) ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.IsLeaf(i)) + continue ; + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.ChildRank(v) != tree.ChildRank(i) + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || t.NodeMap(t.ChildSelect(v, 1)) != map[tree.ChildSelect(i, 1)] + || (!tree.IsLastChild(i) && t.NodeMap(t.NextSibling(v)) != map[tree.NextSibling(i)]) + || (!tree.IsFirstChild(i) && t.NodeMap(t.PrevSibling(v)) != map[tree.PrevSibling(i)]) + || (t.ChildrenCount(v) > 1 && t.NodeMap(t.ChildSelect(v, 2)) != map[tree.ChildSelect(i, 2)]) + || (t.IsAncestor(v, t.NodeSelect(map[internalN])) != tree.IsAncestor(i, internalN)) + //|| t.Depth(v) != tree.Depth(i) + || t.SubTreeSize(v) != tree.SubTreeSize(i) + || t.LeafCountInSubTree(v) != tree.LeafCountInSubTree(i) + || t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))) != map[tree.LCA(i, internalN)] + ) + ++mismatchCnt ; + //printf("%d %d %d\n", v, + // t.NodeSelect(map[internalN]), + // t.LCA(v, t.NodeSelect(map[internalN]))) ; + //printf("%d %d. %d. %d %d. 
%d\n", t.LeafCountInSubTree(v), tree.LeafCountInSubTree(i), tree.GetSize(), v, i, tree.ChildrenCount(i)) ; + } + + // Labels + for (i = 0 ; i < internalN ; ++i) + { + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenLabeled(v, 3) != tree.ChildrenLabeled(i, 3) + || (tree.ChildrenLabeled(i, 3 ) > 0 && t.NodeMap(t.LabeledChildSelect(v, 3, 2)) != map[tree.LabeledChildSelect(i, 3, 2)]) + || t.ChildLabel(v) != tree.ChildLabel(i) + ) + { + // printf("%d %d %d: %d %d %d\n", i, map[i], v, t.ChildrenLabeled(v, 1), tree.ChildrenLabeled(i, 1), + // tree.ChildrenCount(i)) ; + ++mismatchCnt ; + } + } + + printf("\nLabeled tree mismatch count %u\n", mismatchCnt) ; + printf("Labeled tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + + delete[] map ; + } + else if (!strcmp(argv[1], "patternrs")) + { + int k = 0 ; + unsigned int sum ; + WORD *B ; + size_t n = 1000000 ; + n = 65536*32 ; + + B = Utils::MallocByBits(n) ; + + //for (i = 0 ; i*1 < n ; ++i ) + // Utils::BitSet(B, i*1) ; + /*for (i = 0 ; i < n ; ) + { + int rlen = 5 ; + if (argc > 2) + rlen = atoi(argv[2]) ; + for (int j = 0 ; j < rlen ; ++j) + Utils::BitSet(B, i + j) ; + i += 4 * rlen ; + }*/ + /*for (i = 0 ; i < n ; ++i) + { + if (rand() & 1) + Utils::BitSet(B, i) ; + }*/ + DS_Parenthesis tmp ; + tmp.GenerateRandomBalanceParenthesis(B, n) ; + + printf("Raw size: %d\n", (int)DIV_CEIL(n, 8)) ; + + WORD pat = 2 ; // binary 10 + int patLen = 2 ; + DS_PatternRankSelect patrs ; + mismatchCnt = 0 ; + sum = 0 ; + patrs.Init(B, n, pat, patLen) ; + for (i = 0 ; i < n ; ++i) + { + if (patrs.IsPattern(i, B, n)) + ++sum ; + if (patrs.Rank(i, B, n) != sum) + ++mismatchCnt ; + } + printf("Rank mismatch count: %d\n", mismatchCnt) ; + + mismatchCnt = 0 ; + k = 0 ; + for (i = 0 ; i < n ; ++i) + { + if (patrs.IsPattern(i, B, n)) + { + size_t s = patrs.Select(k + 1, B, n) ; + if (s != i) + { + ++mismatchCnt ; + //printf("mismatch %d: %d %d\n", k + 1, s, i) ; + } + ++k ; + } + } + printf("Select mismatch count: %d (%d)\n", mismatchCnt, 
k) ; + printf("DS_PatternRankSelect space: %lu\n", patrs.GetSpace()) ; + } + else if (!strcmp(argv[1], "cardtree")) // cardinal tree + { + // Test example: tree with child count 2 (from root), 3, 4, 5, .... + // Or a binary tree + int i, j ; + int c = 4 ; // cardinality + + Tree_Cardinal_Plain tree ; + tree.Init(c) ; + + int internalN = 10000 ; + srand(1) ; + for (i = 0 ; i < internalN ; ++i) + { + //int childCnt = rand() % c + 2 ; + //int childCnt = c ; + int step = rand() % c + 1 ; + //step = 1 ; + for (j = 0 ; j < c ; j += step) + tree.AddNode(i, j) ; + } + + size_t *map = new size_t[tree.GetSize()] ; + if (0) + { + // This test is for binary tree. + mismatchCnt = 0 ; + for (i = 0 ; i < internalN ; ++i) + { + if (tree.ChildrenCount(i) != 2 || (int)tree.FirstChild(i) != (2 * i + 1) + || (int)tree.LastChild(i) != (2 * i + 2)) + ++mismatchCnt ; + } + printf("plain cardinal tree mismatch count %u\n", mismatchCnt) ; + } + printf("plain cardinal tree space usage (bytes): %lu\n", tree.GetSpace(true)) ; + + if (1) + { + Tree_Cardinal_LOUDS<> t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), tree.GetSize(), c, map) ; + for (i = 0 ; i < internalN ; ++i) + { + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || (t.ChildrenCount(v) > 1 && t.NodeMap(t.ChildSelect(v, 2)) != map[tree.ChildSelect(i, 2)]) + || t.ChildRank(v) != tree.ChildRank(i) + || t.NodeMap(t.LCA(v, t.NodeSelect(internalN))) != map[tree.LCA(i, internalN)] + ) + ++mismatchCnt ; + //printf("%d: %d %d\n", v, t.NodeMap(t.Parent(v)), tree.Parent(i)) ; + } + printf("\nLOUDS cardinal tree mismatch count %u\n", mismatchCnt) ; + printf("LOUDS cardinal tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + + { + Tree_Cardinal_Ordinal<> t ; + mismatchCnt = 0 ; + + t.Init(tree.GetTreeData().data(), 
tree.GetSize(), c, map) ; + for (i = 0 ; i < internalN ; ++i) + { + size_t v = t.NodeSelect(map[i]) ; + if (t.ChildrenCount(v) != tree.ChildrenCount(i) + || t.NodeMap(t.FirstChild(v)) != map[tree.FirstChild(i)] + || t.NodeMap(t.LastChild(v)) != map[tree.LastChild(i)] + || t.NodeMap(t.Parent(v)) != map[tree.Parent(i)] + || (t.ChildrenCount(v) > 1 && t.NodeMap(t.ChildSelect(v, 2)) != map[tree.ChildSelect(i, 2)]) + || t.ChildRank(v) != tree.ChildRank(i) + || t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))) != map[tree.LCA(i, internalN)] + ) + ++mismatchCnt ; + //printf("%d %d %d: %d %d\n", v, i, map[i], t.NodeMap(t.LCA(v, t.NodeSelect(map[internalN]))), map[tree.LCA(i, internalN)]) ; + } + printf("\nDFUDS cardinal tree mismatch count %u\n", mismatchCnt) ; + printf("DFUDS cardinal tree space usage (bytes): %lu\n", t.GetSpace()) ; + } + delete[] map ; + } + + PrintLog("Done") ; + return 0 ; +}