Skip to content

Commit

Permalink
Merge pull request #12 from mourisl/centrifuger_inspect
Browse files Browse the repository at this point in the history
Add --size-table option in centrifuger-inspect
  • Loading branch information
mourisl authored Jul 11, 2024
2 parents cd471d9 + fce157d commit b8abab2
Show file tree
Hide file tree
Showing 7 changed files with 220 additions and 21 deletions.
16 changes: 16 additions & 0 deletions CentrifugerInspect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ char usage[] = "./centrifuger-inspect [OPTIONS]:\n"
"\t--conversion-table: print the seqID to taxonomy ID translation information\n"
"\t--taxonomy-tree: print the taxonomy tree\n"
"\t--name-table: print the scientific name for each strain in the database\n"
"\t--size-table: print the lengths of the sequences belonging to the same taxonomic ID"
"\t--index-size: print the index information\n"
""
;
Expand All @@ -27,6 +28,7 @@ static struct option long_options[] = {
{"conversion-table", no_argument, 0, ARGV_CONVERSION_TABLE},
{"taxonomy-tree", no_argument, 0, ARGV_TAXONOMY_TREE},
{"name-table", no_argument, 0, ARGV_NAME_TABLE},
{"size-table", no_argument, 0, ARGV_INSPECT_SIZE_TABLE},
{"index-size", no_argument, 0, ARGV_INSPECT_INDEXSIZE},
{ (char *)0, 0, 0, 0}
} ;
Expand Down Expand Up @@ -106,6 +108,20 @@ int main(int argc, char *argv[])
{
taxonomy.PrintNameTable(stdout) ;
}
else if (inspectItem == ARGV_INSPECT_SIZE_TABLE)
{
size_t i ;
size_t n = taxonomy.GetNodeCount() ;
size_t *taxidLength = (size_t *)malloc(sizeof(size_t) * n) ;
taxonomy.ConvertSeqLengthToTaxLength(seqLength, taxidLength) ;
for (i = 0 ; i < n ; ++i)
{
if (taxidLength[i] == 0)//|| i == taxonomy.GetRoot() )
continue ;
fprintf(stdout, "%lu\t%lu\n", taxonomy.GetOrigTaxId(i), taxidLength[i]) ;
}
free(taxidLength) ;
}
else if (inspectItem == ARGV_INSPECT_INDEXSIZE)
{
sprintf(buffer, "%s.1.cfr", idxPrefix) ;
Expand Down
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ centrifuger-inspect: CentrifugerInspect.o

CentrifugerBuild.o: CentrifugerBuild.cpp Builder.hpp ReadFiles.hpp Taxonomy.hpp defs.h compactds/*.hpp
CentrifugerClass.o: CentrifugerClass.cpp Classifier.hpp ReadFiles.hpp Taxonomy.hpp defs.h ResultWriter.hpp ReadPairMerger.hpp ReadFormatter.hpp BarcodeCorrector.hpp BarcodeTranslator.hpp compactds/*.hpp
CentrifugerInspect: CentrifugerInspect.cpp Taxonomy.hpp compactds/*.hpp
CentrifugerInspect.o: CentrifugerInspect.cpp Taxonomy.hpp defs.h compactds/*.hpp

clean:
rm -f *.o centrifuger-build centrifuger centrifuger-inspect
7 changes: 6 additions & 1 deletion MapID.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class MapID
MapID() {}
~MapID() {}

// @return: mapped id
// @return: mapped id. Return the
size_t Add(const T &elem)
{
if (_toNumId.find(elem) == _toNumId.end())
Expand Down Expand Up @@ -47,6 +47,11 @@ class MapID
return _toOrigElem[nid] ;
}

void GetElemList(std::vector<T> &l)
{
l = _toOrigElem ;
}

size_t GetSize()
{
return _toOrigElem.size() ;
Expand Down
174 changes: 169 additions & 5 deletions Taxonomy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@
#include <fstream>
#include <iostream>
#include <sstream>
#include <algorithm>

#include <stdio.h>
#include <string.h>

#include "MapID.hpp"
#include "compactds/SimpleVector.hpp"
#include "compactds/Utils.hpp"
#include "compactds/Tree_Plain.hpp"

using namespace compactds ;

Expand Down Expand Up @@ -185,7 +187,8 @@ class Taxonomy
// Flatten the taxonomy tree to the array
_nodeCnt = _taxIdMap.GetSize() ;
_taxonomyTree = new struct TaxonomyNode[_nodeCnt] ;
memset(_taxonomyTree, 0, sizeof(TaxonomyNode) * _nodeCnt) ;
// The set 0 should be handled by the constructor now.
//memset(_taxonomyTree, 0, sizeof(TaxonomyNode) * _nodeCnt) ;
for (std::map<uint64_t, struct TaxonomyNode>::iterator it = cleanTree.begin() ;
it != cleanTree.end() ; ++it)
{
Expand All @@ -196,10 +199,17 @@ class Taxonomy
// We need to split the update parent node here because the order is random.
for (uint64_t i = 0 ; i < _nodeCnt ; ++i)
{
_taxonomyTree[i].parentTid = _taxIdMap.Map(_taxonomyTree[i].parentTid) ;
_taxonomyTree[ _taxonomyTree[i].parentTid ].leaf = false ;
if (_taxIdMap.IsIn(_taxonomyTree[i].parentTid))
{
_taxonomyTree[i].parentTid = _taxIdMap.Map(_taxonomyTree[i].parentTid) ;
_taxonomyTree[ _taxonomyTree[i].parentTid ].leaf = false ;
}
else
{
Utils::PrintLog("WARNING: parent tax ID of %lu does not exist. Set its parent to itself.", GetOrigTaxId(i)) ;
_taxonomyTree[i].parentTid = i ;
}
}

}

void ReadTaxonomyName(std::string fname, std::map<uint64_t, int> &presentTax)
Expand Down Expand Up @@ -299,6 +309,40 @@ class Taxonomy
}
_seqCnt = _seqStrNameMap.GetSize() ;
}

// Whether b is next to a in accession id
bool IsNextSeqName(const char *a, const char *b)
{
int i, j ;
uint64_t id[2] ;
for (i = 0 ; i < 2 ; ++i)
{
const char *s = a ;
if (i == 1)
s = b ;
id[i] = 0 ;
for (j = 0 ; s[j] ; ++j)
if (s[j] >= '0' && s[j] <= '9')
break ;

//if (j > 2) // It's not something like GCFXXXX numbering
// return false ;

for (; s[j] ; ++j)
{
if (s[j] >= '0' && s[j] <= '9')
{
id[i] = id[i] * 10 + s[j] - '0' ;
}
else
break ;
}
}
if (id[1] == id[0] + 1)
return true ;
else
return false ;
}

void SaveString(FILE *fp, std::string &s)
{
Expand Down Expand Up @@ -502,18 +546,28 @@ class Taxonomy
uint64_t GetOrigTaxId(size_t taxid)
{
if (taxid >= _nodeCnt)
return 0 ;
return _taxIdMap.Inverse(_rootCTaxId) ;
else
return _taxIdMap.Inverse(taxid) ;
}

uint64_t GetRoot()
{
return _rootCTaxId ;
}

size_t CompactTaxId(uint64_t taxid)
{
if (_taxIdMap.IsIn(taxid))
return _taxIdMap.Map(taxid) ;
else
return _nodeCnt ;
}

uint64_t GetParentTid(uint64_t ctid)
{
return _taxonomyTree[ctid].parentTid ;
}

uint8_t GetTaxIdRank(size_t ctid)
{
Expand Down Expand Up @@ -570,6 +624,12 @@ class Taxonomy
return _nodeCnt ;
}

// Get the seq names
void GetSeqNames(std::vector<std::string> &seqNames)
{
_seqStrNameMap.GetElemList(seqNames) ;
}

// Promote the tax id to higher level until number of taxids <= k, or reach LCA
void ReduceTaxIds(const SimpleVector<size_t> &taxIds, SimpleVector<size_t> &promotedTaxIds, int k)
{
Expand Down Expand Up @@ -678,7 +738,111 @@ class Taxonomy

return childrenTax.size() ;
}

// Convert the taxonomy tree structure to a general tree that supports children operations.
// also the converted tree make sure every non-root node has a parent
size_t ConvertToGeneralTree(Tree_Plain &tree)
{
size_t i ;
tree.SetRoot(_rootCTaxId) ;
tree.Init(_nodeCnt) ;
for (i = 0 ; i < _nodeCnt ; ++i)
{
if (i != GetParentTid(i))
tree.AddEdge(i, GetParentTid(i)) ;
}

// Connect the disjoint trees to the root
std::vector<size_t> rootChildrenList = tree.GetChildren( tree.Root() ) ;
std::map<size_t, int> rootChildrenMap ;
size_t rootChildrenListSize = rootChildrenList.size() ;
for (i = 0 ; i < rootChildrenListSize ; ++i)
rootChildrenMap[ rootChildrenList[i] ] = 1 ;
for (i = 0 ; i < _nodeCnt ; ++i)
if (tree.Parent(i) == tree.Root() && rootChildrenMap.find(i) == rootChildrenMap.end())
tree.AddEdge(i, tree.Root()) ;

return _nodeCnt ;
}

// Assume taxIdLength is allocated of size _nodeCnt
void ConvertSeqLengthToTaxLength(std::map<size_t, size_t> seqLength, size_t *taxidLength)
{
size_t i, j ;
std::vector<std::string> seqNames ;
GetSeqNames(seqNames) ;

size_t *taxidCount = (size_t *)calloc(_nodeCnt, sizeof(size_t)) ;// The number of genomes under a tax id. The tax Ids with sequence will be initalized to 1.
size_t *taxidNewLength = (size_t *)calloc(_nodeCnt, sizeof(size_t)) ; // the new length of a taxonomy ID
bool *hasSequence = (bool *)calloc(_nodeCnt, sizeof(bool)) ; // whether the tax id has some sequence/genome

std::sort(seqNames.begin(), seqNames.end()) ;

for (i = 0 ; i < _nodeCnt ; ++i)
taxidLength[i] = 0 ;

size_t seqNameCnt = seqNames.size() ;
for (i = 0 ; i < seqNameCnt ; )
{
size_t seqId = SeqNameToId(seqNames[i]) ;
size_t len = seqLength[seqId] ;
uint64_t taxid = SeqIdToTaxId(seqId) ;
for (j = i + 1 ; j < seqNameCnt ; ++j)
{
size_t nextSeqId = SeqNameToId(seqNames[j]) ;
if (SeqIdToTaxId(nextSeqId) != taxid
|| !IsNextSeqName(seqNames[j - 1].c_str(),
seqNames[j].c_str()))
break ;
len += seqLength[nextSeqId] ;
}

if (taxid < _nodeCnt)
{
if (len > taxidLength[taxid])
taxidLength[taxid] = len ;
taxidCount[taxid] = 1 ;
hasSequence[taxid] = true ;
}
//else // ignore the case the sequence is not in the tree.
// taxidLength[taxid] += len ;

i = j ;
}

// Infer the average genome size for intermediate tax IDs
// If this becomes inefficient in future, consider using a tree DP
for (i = 0 ; i < _nodeCnt ; ++i)
{
if (!hasSequence[i])
continue ;
if (i == _taxonomyTree[i].parentTid)
continue ;

size_t p = _taxonomyTree[i].parentTid ;
while (1)
{
++taxidCount[p] ;
taxidNewLength[p] += taxidLength[i] ;
if (p == _taxonomyTree[p].parentTid)
break ;
p = _taxonomyTree[p].parentTid ;
}
}

for (i = 0 ; i < _nodeCnt ; ++i)
{
size_t sum = taxidNewLength[i] ;
if (hasSequence[i])
sum += taxidLength[i] ;
taxidLength[i] = sum / taxidCount[i] ;
}

free(taxidCount) ;
free(taxidNewLength) ;
free(hasSequence) ;
}

void Save(FILE *fp)
{
SAVE_VAR(fp, _nodeCnt) ;
Expand Down
1 change: 1 addition & 0 deletions argvdefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ enum
ARGV_BARCODE_TRANSLATE,
ARGV_INSPECT_SUMMARY,
ARGV_INSPECT_SEQNAME,
ARGV_INSPECT_SIZE_TABLE,
ARGV_INSPECT_INDEXSIZE
} ;

Expand Down
Loading

0 comments on commit b8abab2

Please sign in to comment.