diff --git a/CentrifugerInspect.cpp b/CentrifugerInspect.cpp index 7386d09..3af464e 100644 --- a/CentrifugerInspect.cpp +++ b/CentrifugerInspect.cpp @@ -16,6 +16,7 @@ char usage[] = "./centrifuger-inspect [OPTIONS]:\n" "\t--conversion-table: print the seqID to taxonomy ID translation information\n" "\t--taxonomy-tree: print the taxonomy tree\n" "\t--name-table: print the scientific name for each strain in the database\n" + "\t--size-table: print the lengths of the sequences belonging to the same taxonomic ID" "\t--index-size: print the index information\n" "" ; @@ -27,6 +28,7 @@ static struct option long_options[] = { {"conversion-table", no_argument, 0, ARGV_CONVERSION_TABLE}, {"taxonomy-tree", no_argument, 0, ARGV_TAXONOMY_TREE}, {"name-table", no_argument, 0, ARGV_NAME_TABLE}, + {"size-table", no_argument, 0, ARGV_INSPECT_SIZE_TABLE}, {"index-size", no_argument, 0, ARGV_INSPECT_INDEXSIZE}, { (char *)0, 0, 0, 0} } ; @@ -106,6 +108,20 @@ int main(int argc, char *argv[]) { taxonomy.PrintNameTable(stdout) ; } + else if (inspectItem == ARGV_INSPECT_SIZE_TABLE) + { + size_t i ; + size_t n = taxonomy.GetNodeCount() ; + size_t *taxidLength = (size_t *)malloc(sizeof(size_t) * n) ; + taxonomy.ConvertSeqLengthToTaxLength(seqLength, taxidLength) ; + for (i = 0 ; i < n ; ++i) + { + if (taxidLength[i] == 0)//|| i == taxonomy.GetRoot() ) + continue ; + fprintf(stdout, "%lu\t%lu\n", taxonomy.GetOrigTaxId(i), taxidLength[i]) ; + } + free(taxidLength) ; + } else if (inspectItem == ARGV_INSPECT_INDEXSIZE) { sprintf(buffer, "%s.1.cfr", idxPrefix) ; diff --git a/Makefile b/Makefile index 314fec3..54282ef 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ centrifuger-inspect: CentrifugerInspect.o CentrifugerBuild.o: CentrifugerBuild.cpp Builder.hpp ReadFiles.hpp Taxonomy.hpp defs.h compactds/*.hpp CentrifugerClass.o: CentrifugerClass.cpp Classifier.hpp ReadFiles.hpp Taxonomy.hpp defs.h ResultWriter.hpp ReadPairMerger.hpp ReadFormatter.hpp BarcodeCorrector.hpp BarcodeTranslator.hpp compactds/*.hpp -CentrifugerInspect: CentrifugerInspect.cpp Taxonomy.hpp compactds/*.hpp +CentrifugerInspect.o: CentrifugerInspect.cpp Taxonomy.hpp defs.h compactds/*.hpp + clean: rm -f *.o centrifuger-build centrifuger centrifuger-inspect diff --git a/MapID.hpp b/MapID.hpp index a642603..3fcac84 100644 --- a/MapID.hpp +++ b/MapID.hpp @@ -17,7 +17,7 @@ class MapID MapID() {} ~MapID() {} - // @return: mapped id + // @return: mapped id. Return the size_t Add(const T &elem) { if (_toNumId.find(elem) == _toNumId.end()) @@ -47,6 +47,11 @@ class MapID return _toOrigElem[nid] ; } + void GetElemList(std::vector &l) + { + l = _toOrigElem ; + } + size_t GetSize() { return _toOrigElem.size() ; diff --git a/Taxonomy.hpp b/Taxonomy.hpp index 84412e2..e219522 100644 --- a/Taxonomy.hpp +++ b/Taxonomy.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,7 @@ #include "MapID.hpp" #include "compactds/SimpleVector.hpp" #include "compactds/Utils.hpp" +#include "compactds/Tree_Plain.hpp" using namespace compactds ; @@ -185,7 +187,8 @@ class Taxonomy // Flatten the taxonomy tree to the array _nodeCnt = _taxIdMap.GetSize() ; _taxonomyTree = new struct TaxonomyNode[_nodeCnt] ; - memset(_taxonomyTree, 0, sizeof(TaxonomyNode) * _nodeCnt) ; + // The set 0 should be handled by the constructor now. + //memset(_taxonomyTree, 0, sizeof(TaxonomyNode) * _nodeCnt) ; for (std::map::iterator it = cleanTree.begin() ; it != cleanTree.end() ; ++it) { @@ -196,10 +199,17 @@ class Taxonomy // We need to split the update parent node here because the order is random. for (uint64_t i = 0 ; i < _nodeCnt ; ++i) { - _taxonomyTree[i].parentTid = _taxIdMap.Map(_taxonomyTree[i].parentTid) ; - _taxonomyTree[ _taxonomyTree[i].parentTid ].leaf = false ; + if (_taxIdMap.IsIn(_taxonomyTree[i].parentTid)) + { + _taxonomyTree[i].parentTid = _taxIdMap.Map(_taxonomyTree[i].parentTid) ; + _taxonomyTree[ _taxonomyTree[i].parentTid ].leaf = false ; + } + else + { + Utils::PrintLog("WARNING: parent tax ID of %lu does not exist. Set its parent to itself.", GetOrigTaxId(i)) ; + _taxonomyTree[i].parentTid = i ; + } } - } void ReadTaxonomyName(std::string fname, std::map &presentTax) @@ -299,6 +309,40 @@ class Taxonomy } _seqCnt = _seqStrNameMap.GetSize() ; } + + // Whether b is next to a in accession id + bool IsNextSeqName(const char *a, const char *b) + { + int i, j ; + uint64_t id[2] ; + for (i = 0 ; i < 2 ; ++i) + { + const char *s = a ; + if (i == 1) + s = b ; + id[i] = 0 ; + for (j = 0 ; s[j] ; ++j) + if (s[j] >= '0' && s[j] <= '9') + break ; + + //if (j > 2) // It's not something like GCFXXXX numbering + // return false ; + + for (; s[j] ; ++j) + { + if (s[j] >= '0' && s[j] <= '9') + { + id[i] = id[i] * 10 + s[j] - '0' ; + } + else + break ; + } + } + if (id[1] == id[0] + 1) + return true ; + else + return false ; + } void SaveString(FILE *fp, std::string &s) { @@ -502,11 +546,16 @@ class Taxonomy uint64_t GetOrigTaxId(size_t taxid) { if (taxid >= _nodeCnt) - return 0 ; + return _taxIdMap.Inverse(_rootCTaxId) ; else return _taxIdMap.Inverse(taxid) ; } + uint64_t GetRoot() + { + return _rootCTaxId ; + } + size_t CompactTaxId(uint64_t taxid) { if (_taxIdMap.IsIn(taxid)) @@ -514,6 +563,11 @@ class Taxonomy else return _nodeCnt ; } + + uint64_t GetParentTid(uint64_t ctid) + { + return _taxonomyTree[ctid].parentTid ; + } uint8_t GetTaxIdRank(size_t ctid) { @@ -570,6 +624,12 @@ class Taxonomy return _nodeCnt ; } + // Get the seq names + void GetSeqNames(std::vector &seqNames) + { + _seqStrNameMap.GetElemList(seqNames) ; + } + // Promote the tax id to higher level until number of taxids <= k, or reach LCA void ReduceTaxIds(const SimpleVector &taxIds, SimpleVector &promotedTaxIds, int k) { @@ -678,7 +738,111 @@ class Taxonomy return childrenTax.size() ; } + + // Convert the taxonomy tree structure to a general tree that supports children operations. + // also the converted tree make sure every non-root node has a parent + size_t ConvertToGeneralTree(Tree_Plain &tree) + { + size_t i ; + tree.SetRoot(_rootCTaxId) ; + tree.Init(_nodeCnt) ; + for (i = 0 ; i < _nodeCnt ; ++i) + { + if (i != GetParentTid(i)) + tree.AddEdge(i, GetParentTid(i)) ; + } + + // Connect the disjoint trees to the root + std::vector rootChildrenList = tree.GetChildren( tree.Root() ) ; + std::map rootChildrenMap ; + size_t rootChildrenListSize = rootChildrenList.size() ; + for (i = 0 ; i < rootChildrenListSize ; ++i) + rootChildrenMap[ rootChildrenList[i] ] = 1 ; + for (i = 0 ; i < _nodeCnt ; ++i) + if (tree.Parent(i) == tree.Root() && rootChildrenMap.find(i) == rootChildrenMap.end()) + tree.AddEdge(i, tree.Root()) ; + return _nodeCnt ; + } + + // Assume taxIdLength is allocated of size _nodeCnt + void ConvertSeqLengthToTaxLength(std::map seqLength, size_t *taxidLength) + { + size_t i, j ; + std::vector seqNames ; + GetSeqNames(seqNames) ; + + size_t *taxidCount = (size_t *)calloc(_nodeCnt, sizeof(size_t)) ;// The number of genomes under a tax id. The tax Ids with sequence will be initalized to 1. + size_t *taxidNewLength = (size_t *)calloc(_nodeCnt, sizeof(size_t)) ; // the new length of a taxonomy ID + bool *hasSequence = (bool *)calloc(_nodeCnt, sizeof(bool)) ; // whether the tax id has some sequence/genome + + std::sort(seqNames.begin(), seqNames.end()) ; + + for (i = 0 ; i < _nodeCnt ; ++i) + taxidLength[i] = 0 ; + + size_t seqNameCnt = seqNames.size() ; + for (i = 0 ; i < seqNameCnt ; ) + { + size_t seqId = SeqNameToId(seqNames[i]) ; + size_t len = seqLength[seqId] ; + uint64_t taxid = SeqIdToTaxId(seqId) ; + for (j = i + 1 ; j < seqNameCnt ; ++j) + { + size_t nextSeqId = SeqNameToId(seqNames[j]) ; + if (SeqIdToTaxId(nextSeqId) != taxid + || !IsNextSeqName(seqNames[j - 1].c_str(), + seqNames[j].c_str())) + break ; + len += seqLength[nextSeqId] ; + } + + if (taxid < _nodeCnt) + { + if (len > taxidLength[taxid]) + taxidLength[taxid] = len ; + taxidCount[taxid] = 1 ; + hasSequence[taxid] = true ; + } + //else // ignore the case the sequence is not in the tree. + // taxidLength[taxid] += len ; + + i = j ; + } + + // Infer the average genome size for intermediate tax IDs + // If this becomes inefficient in future, consider using a tree DP + for (i = 0 ; i < _nodeCnt ; ++i) + { + if (!hasSequence[i]) + continue ; + if (i == _taxonomyTree[i].parentTid) + continue ; + + size_t p = _taxonomyTree[i].parentTid ; + while (1) + { + ++taxidCount[p] ; + taxidNewLength[p] += taxidLength[i] ; + if (p == _taxonomyTree[p].parentTid) + break ; + p = _taxonomyTree[p].parentTid ; + } + } + + for (i = 0 ; i < _nodeCnt ; ++i) + { + size_t sum = taxidNewLength[i] ; + if (hasSequence[i]) + sum += taxidLength[i] ; + taxidLength[i] = sum / taxidCount[i] ; + } + + free(taxidCount) ; + free(taxidNewLength) ; + free(hasSequence) ; + } + void Save(FILE *fp) { SAVE_VAR(fp, _nodeCnt) ; diff --git a/argvdefs.h b/argvdefs.h index 04834cf..166cfef 100644 --- a/argvdefs.h +++ b/argvdefs.h @@ -22,6 +22,7 @@ enum ARGV_BARCODE_TRANSLATE, ARGV_INSPECT_SUMMARY, ARGV_INSPECT_SEQNAME, + ARGV_INSPECT_SIZE_TABLE, ARGV_INSPECT_INDEXSIZE } ; diff --git a/compactds/Tree_Plain.hpp b/compactds/Tree_Plain.hpp index 793c12f..7b8f2d0 100644 --- a/compactds/Tree_Plain.hpp +++ b/compactds/Tree_Plain.hpp @@ -46,15 +46,25 @@ class Tree_Plain: public Tree { private: std::vector _nodes ; + size_t _root ; // allow flexible root setting. For compact representation, this public: - Tree_Plain() {} + Tree_Plain() + { + _root = 0 ; + } + ~Tree_Plain() {} + void SetRoot(size_t r) + { + _root = r ; + } + void Init() { _n = 1 ; - struct _plainTreeNode node(0, 0, 0, 0) ; + struct _plainTreeNode node(_root, _root, _root, _root) ; _nodes.push_back(node) ; } @@ -66,7 +76,7 @@ class Tree_Plain: public Tree _n = n ; for (i = 0 ; i < _n ; ++i) { - struct _plainTreeNode node(0, 0, 0, 0) ; + struct _plainTreeNode node(_root, _root, _root, _root) ; _nodes.push_back(node) ; } } @@ -81,10 +91,10 @@ class Tree_Plain: public Tree size_t AddNode(size_t parent) { size_t id = _nodes.size() ; - struct _plainTreeNode node(parent, 0, 0, 0) ; + struct _plainTreeNode node(parent, _root, _root, _root) ; size_t lastSibling = LastChild(parent) ; - if (lastSibling == 0) + if (lastSibling == _root) _nodes[parent].child = id ; else _nodes[lastSibling].sibling = id ; @@ -101,7 +111,7 @@ class Tree_Plain: public Tree _nodes[c].parent = parent ; size_t lastSibling = LastChild(parent) ; - if (lastSibling == 0) + if (lastSibling == _root) _nodes[parent].child = c ; else _nodes[lastSibling].sibling = c ; @@ -111,7 +121,7 @@ class Tree_Plain: public Tree size_t Root() const { - return 0 ; + return _root ; } // t-th(1-based) child ; @@ -140,7 +150,7 @@ class Tree_Plain: public Tree { size_t i ; size_t c = FirstChild(v) ; - for (i = 0 ; c != 0 ; ++i) + for (i = 0 ; c != _root ; ++i) c = NextSibling(c) ; return i ; } @@ -149,7 +159,7 @@ class Tree_Plain: public Tree { std::vector ret ; size_t c = _nodes[v].child ; - while (c != 0) + while (c != _root) { ret.push_back(c) ; c = _nodes[c].sibling ; @@ -190,7 +200,7 @@ class Tree_Plain: public Tree bool IsLeaf(size_t v) const { - if (_nodes[v].child == 0) + if (_nodes[v].child == _root) return true ; return false ; } @@ -219,8 +229,8 @@ class Tree_Plain: public Tree size_t ChildrenLabeled(size_t v, size_t l) const { size_t ret = 0 ; - int c = _nodes[v].child ; - while (c != 0) + size_t c = _nodes[v].child ; + while (c != _root) { if (_nodes[c].label == l) ++ret ; @@ -234,7 +244,7 @@ class Tree_Plain: public Tree { size_t cnt = 0 ; size_t c = _nodes[v].child ; - while (c != 0) + while (c != _root) { if (_nodes[c].label == l) ++cnt ; @@ -255,6 +265,7 @@ class Tree_Plain: public Tree { Tree::Save(fp) ; size_t i ; + SAVE_VAR(fp, _root) ; for (i = 0 ; i < _n ; ++i) _nodes[i].Save(fp) ; } @@ -264,6 +275,7 @@ class Tree_Plain: public Tree std::vector< struct _plainTreeNode >().swap(_nodes) ; Tree::Load(fp) ; + LOAD_VAR(fp, _root) ; size_t i ; struct _plainTreeNode node(0, 0, 0, 0) ; for (i = 0 ; i < _n ; ++i) diff --git a/defs.h b/defs.h index 3f8f471..f9ff27c 100644 --- a/defs.h +++ b/defs.h @@ -5,7 +5,7 @@ //#define DEBUG -#define CENTRIFUGER_VERSION "1.0.4-r153" +#define CENTRIFUGER_VERSION "1.0.5-r159" extern char nucToNum[26] ; extern char numToNuc[26] ;