Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add --size-table option in centrifuger-inspect #12

Merged
merged 5 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CentrifugerInspect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ char usage[] = "./centrifuger-inspect [OPTIONS]:\n"
"\t--conversion-table: print the seqID to taxonomy ID translation information\n"
"\t--taxonomy-tree: print the taxonomy tree\n"
"\t--name-table: print the scientific name for each strain in the database\n"
"\t--size-table: print the lengths of the sequences belonging to the same taxonomic ID"
"\t--index-size: print the index information\n"
""
;
Expand All @@ -27,6 +28,7 @@ static struct option long_options[] = {
{"conversion-table", no_argument, 0, ARGV_CONVERSION_TABLE},
{"taxonomy-tree", no_argument, 0, ARGV_TAXONOMY_TREE},
{"name-table", no_argument, 0, ARGV_NAME_TABLE},
{"size-table", no_argument, 0, ARGV_INSPECT_SIZE_TABLE},
{"index-size", no_argument, 0, ARGV_INSPECT_INDEXSIZE},
{ (char *)0, 0, 0, 0}
} ;
Expand Down Expand Up @@ -106,6 +108,20 @@ int main(int argc, char *argv[])
{
taxonomy.PrintNameTable(stdout) ;
}
else if (inspectItem == ARGV_INSPECT_SIZE_TABLE)
{
size_t i ;
size_t n = taxonomy.GetNodeCount() ;
size_t *taxidLength = (size_t *)malloc(sizeof(size_t) * n) ;
taxonomy.ConvertSeqLengthToTaxLength(seqLength, taxidLength) ;
for (i = 0 ; i < n ; ++i)
{
if (taxidLength[i] == 0)//|| i == taxonomy.GetRoot() )
continue ;
fprintf(stdout, "%lu\t%lu\n", taxonomy.GetOrigTaxId(i), taxidLength[i]) ;
}
free(taxidLength) ;
}
else if (inspectItem == ARGV_INSPECT_INDEXSIZE)
{
sprintf(buffer, "%s.1.cfr", idxPrefix) ;
Expand Down
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ centrifuger-inspect: CentrifugerInspect.o

CentrifugerBuild.o: CentrifugerBuild.cpp Builder.hpp ReadFiles.hpp Taxonomy.hpp defs.h compactds/*.hpp
CentrifugerClass.o: CentrifugerClass.cpp Classifier.hpp ReadFiles.hpp Taxonomy.hpp defs.h ResultWriter.hpp ReadPairMerger.hpp ReadFormatter.hpp BarcodeCorrector.hpp BarcodeTranslator.hpp compactds/*.hpp
CentrifugerInspect: CentrifugerInspect.cpp Taxonomy.hpp compactds/*.hpp
CentrifugerInspect.o: CentrifugerInspect.cpp Taxonomy.hpp defs.h compactds/*.hpp

clean:
rm -f *.o centrifuger-build centrifuger centrifuger-inspect
7 changes: 6 additions & 1 deletion MapID.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class MapID
MapID() {}
~MapID() {}

// @return: mapped id
// @return: mapped id. Return the
size_t Add(const T &elem)
{
if (_toNumId.find(elem) == _toNumId.end())
Expand Down Expand Up @@ -47,6 +47,11 @@ class MapID
return _toOrigElem[nid] ;
}

void GetElemList(std::vector<T> &l)
{
l = _toOrigElem ;
}

size_t GetSize()
{
return _toOrigElem.size() ;
Expand Down
174 changes: 169 additions & 5 deletions Taxonomy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@
#include <fstream>
#include <iostream>
#include <sstream>
#include <algorithm>

#include <stdio.h>
#include <string.h>

#include "MapID.hpp"
#include "compactds/SimpleVector.hpp"
#include "compactds/Utils.hpp"
#include "compactds/Tree_Plain.hpp"

using namespace compactds ;

Expand Down Expand Up @@ -185,7 +187,8 @@ class Taxonomy
// Flatten the taxonomy tree to the array
_nodeCnt = _taxIdMap.GetSize() ;
_taxonomyTree = new struct TaxonomyNode[_nodeCnt] ;
memset(_taxonomyTree, 0, sizeof(TaxonomyNode) * _nodeCnt) ;
// The set 0 should be handled by the constructor now.
//memset(_taxonomyTree, 0, sizeof(TaxonomyNode) * _nodeCnt) ;
for (std::map<uint64_t, struct TaxonomyNode>::iterator it = cleanTree.begin() ;
it != cleanTree.end() ; ++it)
{
Expand All @@ -196,10 +199,17 @@ class Taxonomy
// We need to split the update parent node here because the order is random.
for (uint64_t i = 0 ; i < _nodeCnt ; ++i)
{
_taxonomyTree[i].parentTid = _taxIdMap.Map(_taxonomyTree[i].parentTid) ;
_taxonomyTree[ _taxonomyTree[i].parentTid ].leaf = false ;
if (_taxIdMap.IsIn(_taxonomyTree[i].parentTid))
{
_taxonomyTree[i].parentTid = _taxIdMap.Map(_taxonomyTree[i].parentTid) ;
_taxonomyTree[ _taxonomyTree[i].parentTid ].leaf = false ;
}
else
{
Utils::PrintLog("WARNING: parent tax ID of %lu does not exist. Set its parent to itself.", GetOrigTaxId(i)) ;
_taxonomyTree[i].parentTid = i ;
}
}

}

void ReadTaxonomyName(std::string fname, std::map<uint64_t, int> &presentTax)
Expand Down Expand Up @@ -299,6 +309,40 @@ class Taxonomy
}
_seqCnt = _seqStrNameMap.GetSize() ;
}

// Whether b is next to a in accession id
bool IsNextSeqName(const char *a, const char *b)
{
int i, j ;
uint64_t id[2] ;
for (i = 0 ; i < 2 ; ++i)
{
const char *s = a ;
if (i == 1)
s = b ;
id[i] = 0 ;
for (j = 0 ; s[j] ; ++j)
if (s[j] >= '0' && s[j] <= '9')
break ;

//if (j > 2) // It's not something like GCFXXXX numbering
// return false ;

for (; s[j] ; ++j)
{
if (s[j] >= '0' && s[j] <= '9')
{
id[i] = id[i] * 10 + s[j] - '0' ;
}
else
break ;
}
}
if (id[1] == id[0] + 1)
return true ;
else
return false ;
}

void SaveString(FILE *fp, std::string &s)
{
Expand Down Expand Up @@ -502,18 +546,28 @@ class Taxonomy
uint64_t GetOrigTaxId(size_t taxid)
{
if (taxid >= _nodeCnt)
return 0 ;
return _taxIdMap.Inverse(_rootCTaxId) ;
else
return _taxIdMap.Inverse(taxid) ;
}

uint64_t GetRoot()
{
return _rootCTaxId ;
}

size_t CompactTaxId(uint64_t taxid)
{
if (_taxIdMap.IsIn(taxid))
return _taxIdMap.Map(taxid) ;
else
return _nodeCnt ;
}

uint64_t GetParentTid(uint64_t ctid)
{
return _taxonomyTree[ctid].parentTid ;
}

uint8_t GetTaxIdRank(size_t ctid)
{
Expand Down Expand Up @@ -570,6 +624,12 @@ class Taxonomy
return _nodeCnt ;
}

// Get the seq names
void GetSeqNames(std::vector<std::string> &seqNames)
{
_seqStrNameMap.GetElemList(seqNames) ;
}

// Promote the tax id to higher level until number of taxids <= k, or reach LCA
void ReduceTaxIds(const SimpleVector<size_t> &taxIds, SimpleVector<size_t> &promotedTaxIds, int k)
{
Expand Down Expand Up @@ -678,7 +738,111 @@ class Taxonomy

return childrenTax.size() ;
}

// Convert the taxonomy tree structure to a general tree that supports children operations.
// also the converted tree make sure every non-root node has a parent
size_t ConvertToGeneralTree(Tree_Plain &tree)
{
size_t i ;
tree.SetRoot(_rootCTaxId) ;
tree.Init(_nodeCnt) ;
for (i = 0 ; i < _nodeCnt ; ++i)
{
if (i != GetParentTid(i))
tree.AddEdge(i, GetParentTid(i)) ;
}

// Connect the disjoint trees to the root
std::vector<size_t> rootChildrenList = tree.GetChildren( tree.Root() ) ;
std::map<size_t, int> rootChildrenMap ;
size_t rootChildrenListSize = rootChildrenList.size() ;
for (i = 0 ; i < rootChildrenListSize ; ++i)
rootChildrenMap[ rootChildrenList[i] ] = 1 ;
for (i = 0 ; i < _nodeCnt ; ++i)
if (tree.Parent(i) == tree.Root() && rootChildrenMap.find(i) == rootChildrenMap.end())
tree.AddEdge(i, tree.Root()) ;

return _nodeCnt ;
}

// Assume taxIdLength is allocated of size _nodeCnt
void ConvertSeqLengthToTaxLength(std::map<size_t, size_t> seqLength, size_t *taxidLength)
{
size_t i, j ;
std::vector<std::string> seqNames ;
GetSeqNames(seqNames) ;

size_t *taxidCount = (size_t *)calloc(_nodeCnt, sizeof(size_t)) ;// The number of genomes under a tax id. The tax Ids with sequence will be initalized to 1.
size_t *taxidNewLength = (size_t *)calloc(_nodeCnt, sizeof(size_t)) ; // the new length of a taxonomy ID
bool *hasSequence = (bool *)calloc(_nodeCnt, sizeof(bool)) ; // whether the tax id has some sequence/genome

std::sort(seqNames.begin(), seqNames.end()) ;

for (i = 0 ; i < _nodeCnt ; ++i)
taxidLength[i] = 0 ;

size_t seqNameCnt = seqNames.size() ;
for (i = 0 ; i < seqNameCnt ; )
{
size_t seqId = SeqNameToId(seqNames[i]) ;
size_t len = seqLength[seqId] ;
uint64_t taxid = SeqIdToTaxId(seqId) ;
for (j = i + 1 ; j < seqNameCnt ; ++j)
{
size_t nextSeqId = SeqNameToId(seqNames[j]) ;
if (SeqIdToTaxId(nextSeqId) != taxid
|| !IsNextSeqName(seqNames[j - 1].c_str(),
seqNames[j].c_str()))
break ;
len += seqLength[nextSeqId] ;
}

if (taxid < _nodeCnt)
{
if (len > taxidLength[taxid])
taxidLength[taxid] = len ;
taxidCount[taxid] = 1 ;
hasSequence[taxid] = true ;
}
//else // ignore the case the sequence is not in the tree.
// taxidLength[taxid] += len ;

i = j ;
}

// Infer the average genome size for intermediate tax IDs
// If this becomes inefficient in future, consider using a tree DP
for (i = 0 ; i < _nodeCnt ; ++i)
{
if (!hasSequence[i])
continue ;
if (i == _taxonomyTree[i].parentTid)
continue ;

size_t p = _taxonomyTree[i].parentTid ;
while (1)
{
++taxidCount[p] ;
taxidNewLength[p] += taxidLength[i] ;
if (p == _taxonomyTree[p].parentTid)
break ;
p = _taxonomyTree[p].parentTid ;
}
}

for (i = 0 ; i < _nodeCnt ; ++i)
{
size_t sum = taxidNewLength[i] ;
if (hasSequence[i])
sum += taxidLength[i] ;
taxidLength[i] = sum / taxidCount[i] ;
}

free(taxidCount) ;
free(taxidNewLength) ;
free(hasSequence) ;
}

void Save(FILE *fp)
{
SAVE_VAR(fp, _nodeCnt) ;
Expand Down
1 change: 1 addition & 0 deletions argvdefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ enum
ARGV_BARCODE_TRANSLATE,
ARGV_INSPECT_SUMMARY,
ARGV_INSPECT_SEQNAME,
ARGV_INSPECT_SIZE_TABLE,
ARGV_INSPECT_INDEXSIZE
} ;

Expand Down
Loading