diff --git a/CentrifugerClass.cpp b/CentrifugerClass.cpp index 0fbeb5d..7b46095 100644 --- a/CentrifugerClass.cpp +++ b/CentrifugerClass.cpp @@ -17,7 +17,7 @@ #include "BarcodeCorrector.hpp" #include "BarcodeTranslator.hpp" -char usage[] = "./centrifuger [OPTIONS]:\n" +char usage[] = "./centrifuger [OPTIONS] > output.tsv:\n" "Required:\n" "\t-x FILE: index prefix\n" "\t-1 FILE -2 FILE: paired-end read\n" @@ -27,6 +27,8 @@ char usage[] = "./centrifuger [OPTIONS]:\n" //"\t-o STRING: output prefix [centrifuger]\n" "\t-t INT: number of threads [1]\n" "\t-k INT: report upto distinct, primary assignments for each read pair [1]\n" + "\t--un STR: output unclassified reads to files with the prefix of \n" + "\t--cl STR: output classified reads to files with the prefix of \n" "\t--barcode STR: path to the barcode file\n" "\t--UMI STR: path to the UMI file\n" "\t--read-format STR: format for read, barcode and UMI files, e.g. r1:0:-1,r2:0:-1,bc:0:15,um:16:-1 for paired-end files with barcode and UMI\n" @@ -40,6 +42,8 @@ char usage[] = "./centrifuger [OPTIONS]:\n" static const char *short_options = "x:1:2:u:o:t:k:v" ; static struct option long_options[] = { + { "un", required_argument, 0, ARGV_OUTPUT_UNCLASSIFIED}, + { "cl", required_argument, 0, ARGV_OUTPUT_CLASSIFIED}, { "min-hitlen", required_argument, 0, ARGV_MIN_HITLEN}, { "hitk-factor", required_argument, 0, ARGV_MAX_RESULT_PER_HIT_FACTOR}, { "merge-readpair", no_argument, 0, ARGV_MERGE_READ_PAIR }, @@ -240,6 +244,9 @@ int main(int argc, char *argv[]) ResultWriter resWriter ; ReadPairMerger readPairMerger ; bool mergeReadPair = false ; + + char unclassifiedOutputPrefix[1024] = ""; + char classifiedOutputPrefix[1024] = ""; // variables regarding barcode, UMI ReadFiles barcodeFile ; @@ -332,6 +339,14 @@ int main(int argc, char *argv[]) { barcodeTranslator.SetTranslateTable(optarg) ; } + else if (c == ARGV_OUTPUT_UNCLASSIFIED) + { + strcpy(unclassifiedOutputPrefix, optarg) ; + } + else if (c == ARGV_OUTPUT_CLASSIFIED) + { + strcpy(classifiedOutputPrefix, optarg) ; + } else { Utils::PrintLog("Unknown parameter found.\n%s", usage ) ; @@ -355,8 +370,17 @@ int main(int argc, char *argv[]) readFormatter.AllocateBuffers(4 * threadCnt) ; classifier.Init(idxPrefix, classifierParam) ; + resWriter.SetHasBarcode(hasBarcode) ; resWriter.SetHasUmi(hasUmi) ; + if (unclassifiedOutputPrefix[0] != '\0') + { + resWriter.SetOutputReads(unclassifiedOutputPrefix, hasMate, hasBarcode, hasUmi, reads, 0) ; + } + if (classifiedOutputPrefix[0] != '\0') + { + resWriter.SetOutputReads(classifiedOutputPrefix, hasMate, hasBarcode, hasUmi, reads, 1) ; + } resWriter.OutputHeader() ; const int maxBatchSize = 1024 * threadCnt ; @@ -427,7 +451,9 @@ int main(int argc, char *argv[]) pthread_join( threads[i], NULL ) ; for (i = 0 ; i < batchSize ; ++i) - resWriter.Output(readBatch[i].id, hasBarcode ? barcodeBatch[i].seq : NULL, + resWriter.Output(readBatch[i].id, readBatch[i].seq, readBatch[i].qual, + hasMate ? readBatch2[i].seq : NULL, hasMate ? readBatch2[i].qual : NULL, + hasBarcode ? barcodeBatch[i].seq : NULL, hasUmi ? umiBatch[i].seq : NULL, classifierBatchResults[i]) ; } @@ -534,7 +560,9 @@ int main(int argc, char *argv[]) pthread_join(threads[i], NULL) ; for (i = 0 ; i < batchSize[tag] ; ++i) - resWriter.Output(readBatch[tag][i].id, hasBarcode ? barcodeBatch[tag][i].seq : NULL, + resWriter.Output(readBatch[tag][i].id, readBatch[tag][i].seq, readBatch[tag][i].qual, + hasMate ? readBatch2[tag][i].seq : NULL, hasMate ? readBatch2[tag][i].qual : NULL, + hasBarcode ? barcodeBatch[tag][i].seq : NULL, hasUmi ? umiBatch[tag][i].seq : NULL, classifierBatchResults[tag][i]) ; started = true ; @@ -654,7 +682,10 @@ int main(int argc, char *argv[]) if (started) { for (i = 0 ; i < batchSize[prevTag] ; ++i) - resWriter.Output(readBatch[prevTag][i].id, hasBarcode ? barcodeBatch[prevTag][i].seq : NULL, + resWriter.Output(readBatch[prevTag][i].id, + readBatch[prevTag][i].seq, readBatch[prevTag][i].qual, + hasMate ? readBatch2[prevTag][i].seq : NULL, hasMate ? readBatch2[prevTag][i].qual : NULL, + hasBarcode ? barcodeBatch[prevTag][i].seq : NULL, hasUmi ? umiBatch[prevTag][i].seq : NULL, classifierBatchResults[prevTag][i]) ; } diff --git a/README.md b/README.md index 9a000a8..9321e30 100644 --- a/README.md +++ b/README.md @@ -49,16 +49,17 @@ An example of pre-built index containing human, bacteria, archea, and virus geno #### Classification - Usage: ./centrifuger [OPTIONS] + Usage: ./centrifuger [OPTIONS] > classification.tsv Required: -x FILE: index prefix -1 FILE -2 FILE: paired-end read or -u FILE: single-end read Optional: - -o STRING: output prefix [centrifuger] -t INT: number of threads [1] -k INT: report upto distinct, primary assignments for each read pair [1] + --un STR: output unclassified reads to files with the prefix of , e.g. _1/2.fq.gz + --cl STR: output classified reads to files with the prefix of --barcode STR: path to the barcode file --UMI STR: path to the UMI file --read-format STR: format for read, barcode and UMI files, e.g. r1:0:-1,r2:0:-1,bc:0:15,um:16:-1 for paired-end files with barcode and UMI diff --git a/ReadFiles.hpp b/ReadFiles.hpp index 8ef5775..af715b3 100644 --- a/ReadFiles.hpp +++ b/ReadFiles.hpp @@ -93,6 +93,16 @@ class ReadFiles //gzrewind( gzFp[ fileCnt ]) ; //kseq_rewind( inSeq[ fileCnt] ) ; } + + void RemoveReadIdSuffix(char *id) + { + int len = strlen( id ) ; + if ( ( id[len - 1] == '1' || id[len - 1] == '2' ) + && id[len - 2] == '/' ) + { + id[len - 2] = '\0' ; + } + } public: char *id ; char *seq ; @@ -213,13 +223,7 @@ class ReadFiles free( qual ) ; id = strdup( inSeq->name.s ) ; - int len = strlen( id ) ; - if ( ( id[len - 1] == '1' || id[len - 1] == '2' ) - && id[len - 2] == '/' ) - { - id[len - 2] = '\0' ; - } - + RemoveReadIdSuffix(id) ; seq = strdup( inSeq->seq.s ) ; if ( inSeq->qual.l ) qual = strdup( inSeq->qual.s ) ; @@ -244,7 +248,7 @@ class ReadFiles if ( currentFpInd >= fileCnt ) return 0 ; - if ( *id != NULL ) + if ( *id != NULL ) free( *id ) ; if ( *seq != NULL ) free( *seq ) ; @@ -252,6 +256,7 @@ class ReadFiles free( *qual ) ; *id = strdup( inSeq->name.s ) ; + RemoveReadIdSuffix(*id) ; *seq = strdup( inSeq->seq.s ) ; /*if ( removeReturn ) { diff --git a/ResultWriter.hpp b/ResultWriter.hpp index 6996aa7..cf06fd1 100644 --- a/ResultWriter.hpp +++ b/ResultWriter.hpp @@ -6,6 +6,7 @@ #include "ReadFormatter.hpp" #include "BarcodeCorrector.hpp" #include "BarcodeTranslator.hpp" +#include "ReadFiles.hpp" class ResultWriter { @@ -13,6 +14,10 @@ class ResultWriter FILE *_fpClassification ; bool _hasBarcode ; bool _hasUmi ; + bool _outputUnclassified ; + bool _outputClassified ; + gzFile _gzFpUnclassified[4] ; + gzFile _gzFpClassified[4] ; void PrintExtraCol(const char *s) { @@ -26,14 +31,31 @@ class ResultWriter ResultWriter() { _fpClassification = stdout ; + _outputUnclassified = false ; + _outputClassified = false ; _hasBarcode = false ; _hasUmi = false ; + + int i ; + for (i = 0 ; i < 4 ; ++i) + { + _gzFpUnclassified[i] = NULL ; + _gzFpClassified[i] = NULL ; + } } ~ResultWriter() { if (_fpClassification != stdout) fclose(_fpClassification) ; + int i ; + for (i = 0 ; i < 4 ; ++i) + { + if (_gzFpUnclassified[i]) + gzclose(_gzFpUnclassified[i]) ; + if (_gzFpClassified[i]) + gzclose(_gzFpClassified[i]) ; + } } void SetClassificationOutput(const char *filename) @@ -41,6 +63,65 @@ class ResultWriter _fpClassification = fopen(filename, "w") ; } + // category: 0: unclassified reads, 1: classified reads + void SetOutputReads(const char *prefix, bool hasMate, bool hasBarcode, bool hasUmi, + ReadFiles &reads, int category) + { + int len = strlen(prefix) ; + char extension[10] = "" ; + char *name = (char *)malloc(sizeof(char) * (len + 1 + 10)) ; + + gzFile *gzFps = _gzFpUnclassified ; + if (category == 0) + _outputUnclassified = true ; + else + { + gzFps = _gzFpClassified ; + _outputClassified = true ; + } + + // Add "fa" or "fq" to the name + reads.Next() ; + extension[0] = '.' ; + extension[1] = 'f' ; + extension[2] = reads.qual == NULL ? 'a' : 'q' ; + reads.Rewind() ; + + // Add "gz" to the name + extension[3] = '.' ; + extension[4] = 'g' ; + extension[5] = 'z' ; + extension[6] = '\0' ; + + if (hasMate) + { + sprintf(name, "%s_1%s", prefix, extension) ; + gzFps[0] = gzopen(name, "w1") ; + + sprintf(name, "%s_2%s", prefix, extension) ; + gzFps[1] = gzopen(name, "w1") ; + } + else + { + sprintf(name, "%s%s", prefix, extension) ; + gzFps[0] = gzopen(name, "w1") ; + } + + extension[2] = 'a' ; // always 'fa' for barcode and umi + if (hasBarcode) + { + sprintf(name, "%s_bc%s", prefix, extension) ; + gzFps[2] = gzopen(name, "w1") ; + } + if (hasUmi) + { + sprintf(name, "%s_um%s", prefix, extension) ; + gzFps[3] = gzopen(name, "w1") ; + } + + free(name) ; + } + void SetHasBarcode(bool s) { _hasBarcode = s ; @@ -62,7 +143,9 @@ class ResultWriter fprintf(_fpClassification, "\n") ; } - void Output(const char *readid, const char *barcode, const char *umi, const struct _classifierResult &r) + void Output(const char *readid, + const char *seq1, const char *qual1, const char *seq2, const char *qual2, + const char *barcode, const char *umi, const struct _classifierResult &r) { int i ; int matchCnt = r.taxIds.size() ; @@ -91,6 +174,40 @@ class ResultWriter PrintExtraCol(umi) ; printf("\n") ; } + + for (i = 0 ; i <= 1 ; ++i) + { + gzFile *gzFps = NULL ; + if ( i == 0 && matchCnt == 0 && _outputUnclassified) + gzFps = _gzFpUnclassified ; + else if (i == 1 && matchCnt > 0 && _outputClassified) + gzFps = _gzFpClassified ; + else + continue ; + + if (qual1 == NULL) + gzprintf(gzFps[0], ">%s\n%s\n", readid, seq1) ; + else + gzprintf(gzFps[0], "@%s\n%s\n+\n%s\n", readid, seq1, qual1) ; + + if (seq2 != NULL) + { + if (qual2 == NULL) + gzprintf(gzFps[1], ">%s\n%s\n", readid, seq2) ; + else + gzprintf(gzFps[1], "@%s\n%s\n+\n%s\n", readid, seq2, qual2) ; + } + + if (_hasBarcode) + { + gzprintf(gzFps[2], ">%s\n%s\n", readid, barcode) ; + } + + if (_hasUmi) + { + gzprintf(gzFps[3], ">%s\n%s\n", readid, umi) ; + } + } } void Finalize() diff --git a/argvdefs.h b/argvdefs.h index 166cfef..8004e29 100644 --- a/argvdefs.h +++ b/argvdefs.h @@ -18,6 +18,8 @@ enum ARGV_READFORMAT, ARGV_BARCODE, ARGV_UMI, + ARGV_OUTPUT_UNCLASSIFIED, + ARGV_OUTPUT_CLASSIFIED, ARGV_BARCODE_WHITELIST, ARGV_BARCODE_TRANSLATE, ARGV_INSPECT_SUMMARY,