Skip to content

Commit

Permalink
Merge pull request #16 from mourisl/dev
Browse files Browse the repository at this point in the history
Add the --un,--cl options to output unclassified and classified reads
  • Loading branch information
mourisl authored Oct 3, 2024
2 parents ab8ce25 + baf67e8 commit 2fc1258
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 15 deletions.
39 changes: 35 additions & 4 deletions CentrifugerClass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include "BarcodeCorrector.hpp"
#include "BarcodeTranslator.hpp"

char usage[] = "./centrifuger [OPTIONS]:\n"
char usage[] = "./centrifuger [OPTIONS] > output.tsv:\n"
"Required:\n"
"\t-x FILE: index prefix\n"
"\t-1 FILE -2 FILE: paired-end read\n"
Expand All @@ -27,6 +27,8 @@ char usage[] = "./centrifuger [OPTIONS]:\n"
//"\t-o STRING: output prefix [centrifuger]\n"
"\t-t INT: number of threads [1]\n"
"\t-k INT: report upto <int> distinct, primary assignments for each read pair [1]\n"
"\t--un STR: output unclassified reads to files with the prefix of <str>\n"
"\t--cl STR: output classified reads to files with the prefix of <str>\n"
"\t--barcode STR: path to the barcode file\n"
"\t--UMI STR: path to the UMI file\n"
"\t--read-format STR: format for read, barcode and UMI files, e.g. r1:0:-1,r2:0:-1,bc:0:15,um:16:-1 for paired-end files with barcode and UMI\n"
Expand All @@ -40,6 +42,8 @@ char usage[] = "./centrifuger [OPTIONS]:\n"

static const char *short_options = "x:1:2:u:o:t:k:v" ;
static struct option long_options[] = {
{ "un", required_argument, 0, ARGV_OUTPUT_UNCLASSIFIED},
{ "cl", required_argument, 0, ARGV_OUTPUT_CLASSIFIED},
{ "min-hitlen", required_argument, 0, ARGV_MIN_HITLEN},
{ "hitk-factor", required_argument, 0, ARGV_MAX_RESULT_PER_HIT_FACTOR},
{ "merge-readpair", no_argument, 0, ARGV_MERGE_READ_PAIR },
Expand Down Expand Up @@ -240,6 +244,9 @@ int main(int argc, char *argv[])
ResultWriter resWriter ;
ReadPairMerger readPairMerger ;
bool mergeReadPair = false ;

char unclassifiedOutputPrefix[1024] = "";
char classifiedOutputPrefix[1024] = "";

// variables regarding barcode, UMI
ReadFiles barcodeFile ;
Expand Down Expand Up @@ -332,6 +339,14 @@ int main(int argc, char *argv[])
{
barcodeTranslator.SetTranslateTable(optarg) ;
}
else if (c == ARGV_OUTPUT_UNCLASSIFIED)
{
strcpy(unclassifiedOutputPrefix, optarg) ;
}
else if (c == ARGV_OUTPUT_CLASSIFIED)
{
strcpy(classifiedOutputPrefix, optarg) ;
}
else
{
Utils::PrintLog("Unknown parameter found.\n%s", usage ) ;
Expand All @@ -355,8 +370,17 @@ int main(int argc, char *argv[])
readFormatter.AllocateBuffers(4 * threadCnt) ;

classifier.Init(idxPrefix, classifierParam) ;

resWriter.SetHasBarcode(hasBarcode) ;
resWriter.SetHasUmi(hasUmi) ;
if (unclassifiedOutputPrefix[0] != '\0')
{
resWriter.SetOutputReads(unclassifiedOutputPrefix, hasMate, hasBarcode, hasUmi, reads, 0) ;
}
if (classifiedOutputPrefix[0] != '\0')
{
resWriter.SetOutputReads(classifiedOutputPrefix, hasMate, hasBarcode, hasUmi, reads, 1) ;
}
resWriter.OutputHeader() ;

const int maxBatchSize = 1024 * threadCnt ;
Expand Down Expand Up @@ -427,7 +451,9 @@ int main(int argc, char *argv[])
pthread_join( threads[i], NULL ) ;

for (i = 0 ; i < batchSize ; ++i)
resWriter.Output(readBatch[i].id, hasBarcode ? barcodeBatch[i].seq : NULL,
resWriter.Output(readBatch[i].id, readBatch[i].seq, readBatch[i].qual,
hasMate ? readBatch2[i].seq : NULL, hasMate ? readBatch2[i].qual : NULL,
hasBarcode ? barcodeBatch[i].seq : NULL,
hasUmi ? umiBatch[i].seq : NULL, classifierBatchResults[i]) ;
}

Expand Down Expand Up @@ -534,7 +560,9 @@ int main(int argc, char *argv[])
pthread_join(threads[i], NULL) ;

for (i = 0 ; i < batchSize[tag] ; ++i)
resWriter.Output(readBatch[tag][i].id, hasBarcode ? barcodeBatch[tag][i].seq : NULL,
resWriter.Output(readBatch[tag][i].id, readBatch[tag][i].seq, readBatch[tag][i].qual,
hasMate ? readBatch2[tag][i].seq : NULL, hasMate ? readBatch2[tag][i].qual : NULL,
hasBarcode ? barcodeBatch[tag][i].seq : NULL,
hasUmi ? umiBatch[tag][i].seq : NULL, classifierBatchResults[tag][i]) ;

started = true ;
Expand Down Expand Up @@ -654,7 +682,10 @@ int main(int argc, char *argv[])
if (started)
{
for (i = 0 ; i < batchSize[prevTag] ; ++i)
resWriter.Output(readBatch[prevTag][i].id, hasBarcode ? barcodeBatch[prevTag][i].seq : NULL,
resWriter.Output(readBatch[prevTag][i].id,
readBatch[prevTag][i].seq, readBatch[prevTag][i].qual,
hasMate ? readBatch2[prevTag][i].seq : NULL, hasMate ? readBatch2[prevTag][i].qual : NULL,
hasBarcode ? barcodeBatch[prevTag][i].seq : NULL,
hasUmi ? umiBatch[prevTag][i].seq : NULL, classifierBatchResults[prevTag][i]) ;
}

Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,17 @@ An example of pre-built index containing human, bacteria, archea, and virus geno

#### Classification

Usage: ./centrifuger [OPTIONS]
Usage: ./centrifuger [OPTIONS] > classification.tsv
Required:
-x FILE: index prefix
-1 FILE -2 FILE: paired-end read
or
-u FILE: single-end read
Optional:
-o STRING: output prefix [centrifuger]
-t INT: number of threads [1]
-k INT: report upto <int> distinct, primary assignments for each read pair [1]
--un STR: output unclassified reads to files with the prefix of <str>, e.g. <str>_1/2.fq.gz
--cl STR: output classified reads to files with the prefix of <str>
--barcode STR: path to the barcode file
--UMI STR: path to the UMI file
--read-format STR: format for read, barcode and UMI files, e.g. r1:0:-1,r2:0:-1,bc:0:15,um:16:-1 for paired-end files with barcode and UMI
Expand Down
21 changes: 13 additions & 8 deletions ReadFiles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,16 @@ class ReadFiles
//gzrewind( gzFp[ fileCnt ]) ;
//kseq_rewind( inSeq[ fileCnt] ) ;
}

void RemoveReadIdSuffix(char *id)
{
int len = strlen( id ) ;
if ( ( id[len - 1] == '1' || id[len - 1] == '2' )
&& id[len - 2] == '/' )
{
id[len - 2] = '\0' ;
}
}
public:
char *id ;
char *seq ;
Expand Down Expand Up @@ -213,13 +223,7 @@ class ReadFiles
free( qual ) ;

id = strdup( inSeq->name.s ) ;
int len = strlen( id ) ;
if ( ( id[len - 1] == '1' || id[len - 1] == '2' )
&& id[len - 2] == '/' )
{
id[len - 2] = '\0' ;
}

RemoveReadIdSuffix(id) ;
seq = strdup( inSeq->seq.s ) ;
if ( inSeq->qual.l )
qual = strdup( inSeq->qual.s ) ;
Expand All @@ -244,14 +248,15 @@ class ReadFiles
if ( currentFpInd >= fileCnt )
return 0 ;

if ( *id != NULL )
if ( *id != NULL )
free( *id ) ;
if ( *seq != NULL )
free( *seq ) ;
if ( *qual != NULL )
free( *qual ) ;

*id = strdup( inSeq->name.s ) ;
RemoveReadIdSuffix(*id) ;
*seq = strdup( inSeq->seq.s ) ;
/*if ( removeReturn )
{
Expand Down
119 changes: 118 additions & 1 deletion ResultWriter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,18 @@
#include "ReadFormatter.hpp"
#include "BarcodeCorrector.hpp"
#include "BarcodeTranslator.hpp"
#include "ReadFiles.hpp"

class ResultWriter
{
private:
FILE *_fpClassification ;
bool _hasBarcode ;
bool _hasUmi ;
bool _outputUnclassified ;
bool _outputClassified ;
gzFile _gzFpUnclassified[4] ;
gzFile _gzFpClassified[4] ;

void PrintExtraCol(const char *s)
{
Expand All @@ -26,21 +31,97 @@ class ResultWriter
ResultWriter()
{
_fpClassification = stdout ;
_outputUnclassified = false ;
_outputClassified = false ;
_hasBarcode = false ;
_hasUmi = false ;

int i ;
for (i = 0 ; i < 4 ; ++i)
{
_gzFpUnclassified[i] = NULL ;
_gzFpClassified[i] = NULL ;
}
}

~ResultWriter()
{
if (_fpClassification != stdout)
fclose(_fpClassification) ;
int i ;
for (i = 0 ; i < 4 ; ++i)
{
if (_gzFpUnclassified[i])
gzclose(_gzFpUnclassified[i]) ;
if (_gzFpClassified[i])
gzclose(_gzFpClassified[i]) ;
}
}

void SetClassificationOutput(const char *filename)
{
_fpClassification = fopen(filename, "w") ;
}

// category: 0: unclassified reads, 1: classified reads
void SetOutputReads(const char *prefix, bool hasMate, bool hasBarcode, bool hasUmi,
ReadFiles &reads, int category)
{
int len = strlen(prefix) ;
char extension[10] = "" ;
char *name = (char *)malloc(sizeof(char) * (len + 1 + 10)) ;

gzFile *gzFps = _gzFpUnclassified ;
if (category == 0)
_outputUnclassified = true ;
else
{
gzFps = _gzFpClassified ;
_outputClassified = true ;
}

// Add "fa" or "fq" to the name
reads.Next() ;
extension[0] = '.' ;
extension[1] = 'f' ;
extension[2] = reads.qual == NULL ? 'a' : 'q' ;
reads.Rewind() ;

// Add "gz" to the name
extension[3] = '.' ;
extension[4] = 'g' ;
extension[5] = 'z' ;
extension[6] = '\0' ;

if (hasMate)
{
sprintf(name, "%s_1%s", prefix, extension) ;
gzFps[0] = gzopen(name, "w1") ;

sprintf(name, "%s_2%s", prefix, extension) ;
gzFps[1] = gzopen(name, "w1") ;
}
else
{
sprintf(name, "%s%s", prefix, extension) ;
gzFps[0] = gzopen(name, "w1") ;
}

extension[2] = 'a' ; // always 'fa' for barcode and umi
if (hasBarcode)
{
sprintf(name, "%s_bc%s", prefix, extension) ;
gzFps[2] = gzopen(name, "w1") ;
}
if (hasUmi)
{
sprintf(name, "%s_um%s", prefix, extension) ;
gzFps[3] = gzopen(name, "w1") ;
}

free(name) ;
}

void SetHasBarcode(bool s)
{
_hasBarcode = s ;
Expand All @@ -62,7 +143,9 @@ class ResultWriter
fprintf(_fpClassification, "\n") ;
}

void Output(const char *readid, const char *barcode, const char *umi, const struct _classifierResult &r)
void Output(const char *readid,
const char *seq1, const char *qual1, const char *seq2, const char *qual2,
const char *barcode, const char *umi, const struct _classifierResult &r)
{
int i ;
int matchCnt = r.taxIds.size() ;
Expand Down Expand Up @@ -91,6 +174,40 @@ class ResultWriter
PrintExtraCol(umi) ;
printf("\n") ;
}

for (i = 0 ; i <= 1 ; ++i)
{
gzFile *gzFps = NULL ;
if ( i == 0 && matchCnt == 0 && _outputUnclassified)
gzFps = _gzFpUnclassified ;
else if (i == 1 && matchCnt > 0 && _outputClassified)
gzFps = _gzFpClassified ;
else
continue ;

if (qual1 == NULL)
gzprintf(gzFps[0], ">%s\n%s\n", readid, seq1) ;
else
gzprintf(gzFps[0], "@%s\n%s\n+\n%s\n", readid, seq1, qual1) ;

if (seq2 != NULL)
{
if (qual2 == NULL)
gzprintf(gzFps[1], ">%s\n%s\n", readid, seq2) ;
else
gzprintf(gzFps[1], "@%s\n%s\n+\n%s\n", readid, seq2, qual2) ;
}

if (_hasBarcode)
{
gzprintf(gzFps[2], ">%s\n%s\n", readid, barcode) ;
}

if (_hasUmi)
{
gzprintf(gzFps[3], ">%s\n%s\n", readid, umi) ;
}
}
}

void Finalize()
Expand Down
2 changes: 2 additions & 0 deletions argvdefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ enum
ARGV_READFORMAT,
ARGV_BARCODE,
ARGV_UMI,
ARGV_OUTPUT_UNCLASSIFIED,
ARGV_OUTPUT_CLASSIFIED,
ARGV_BARCODE_WHITELIST,
ARGV_BARCODE_TRANSLATE,
ARGV_INSPECT_SUMMARY,
Expand Down

0 comments on commit 2fc1258

Please sign in to comment.