From cc87c4d42fd05816b289bd9a6b87f778bdf61d06 Mon Sep 17 00:00:00 2001 From: Kyle Hazen <41054023+k1sauce@users.noreply.github.com> Date: Mon, 15 Jul 2024 00:16:19 -0700 Subject: [PATCH] Handle empty files (#5720) * fix: emit all fastqs, remove global log file * fix: remove rg parseing * fix(subworkflow/bcl_demultiplex): remove rg parsing, output empty fastqs, remove log file * test: update snaps * lint: traill\ing whitespace * feat: include read group parsing and empty file check in single closure * test: update snapshot * fix: lint final new line * fix: lint trailing whitespace * fix: add branch and emit empty fastq channel * tests: update snaps * lint: trailing whitespace * fix: filter bool and add test * test: update snaps * test: assert empty file exist * test: assert empty file exist * test: check file exists, update snap * Update test.yml * Update test.yml * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * ci: debug nf-test failure * lint: fix lint --------- Co-authored-by: khazen@clearnotehealth.com <--unset> Co-authored-by: Simon Pearce <24893913+SPPearce@users.noreply.github.com> --- subworkflows/nf-core/bcl_demultiplex/main.nf | 168 +++++++----------- .../bcl_demultiplex/tests/main.nf.test | 13 +- .../bcl_demultiplex/tests/main.nf.test.snap | 127 +++++++++---- .../bcl_demultiplex/tests/nextflow.config | 3 +- tests/config/nf-test.config | 2 +- 5 files changed, 169 insertions(+), 144 deletions(-) diff --git a/subworkflows/nf-core/bcl_demultiplex/main.nf 
b/subworkflows/nf-core/bcl_demultiplex/main.nf index 07b94367fb0..3816f310f9c 100644 --- a/subworkflows/nf-core/bcl_demultiplex/main.nf +++ b/subworkflows/nf-core/bcl_demultiplex/main.nf @@ -7,12 +7,9 @@ include { BCLCONVERT } from "../../../modules/nf-core/bclconvert/main" include { BCL2FASTQ } from "../../../modules/nf-core/bcl2fastq/main" -// Define the log file path before the workflow starts -def logFile = new File("${params.outdir}/invalid_fastqs.log") - workflow BCL_DEMULTIPLEX { take: - ch_flowcell // [[id:"", lane:""],samplesheet.csv, path/to/bcl/files] + ch_flowcell // [[id:"", lane:""], samplesheet.csv, path/to/bcl/files] demultiplexer // bclconvert or bcl2fastq main: @@ -67,106 +64,71 @@ workflow BCL_DEMULTIPLEX { } // Generate meta for each fastq - ch_fastq_with_meta = generate_fastq_meta(ch_fastq, logFile) - - emit: - fastq = ch_fastq_with_meta - reports = ch_reports - stats = ch_stats - interop = ch_interop - versions = ch_versions -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - FUNCTIONS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -// This function appends a given text to a specified log file. -// If the log file does not exist, it creates a new one. 
-def appendToLogFile(String text, File logFile) { - if (!logFile.exists()) { - logFile.createNewFile() - } - // Convert the text to String if it's a GString - String textToWrite = text.toString() - logFile << textToWrite + "\n" // Appends the text to the file with a new line -} - -// Add meta values to fastq channel and skip invalid FASTQ files -def generate_fastq_meta(ch_reads, logFile) { - // Create a tuple with the meta.id and the fastq - ch_reads.transpose().map { fc_meta, fastq -> - // Check if the FASTQ file is empty or has invalid content - def isValid = fastq.withInputStream { is -> - new java.util.zip.GZIPInputStream(is).withReader('ASCII') { reader -> - def line = reader.readLine() - line != null && line.startsWith('@') - } - } - - def meta = null - if (isValid) { - meta = [ - "id": fastq.getSimpleName().toString() - ~/_R[0-9]_001.*$/, - "samplename": fastq.getSimpleName().toString() - ~/_S[0-9]+.*$/, - "readgroup": [:], - "fcid": fc_meta.id, - "lane": fc_meta.lane - ] - meta.readgroup = readgroup_from_fastq(fastq) - meta.readgroup.SM = meta.samplename - } else { - appendToLogFile( - "Empty or invalid FASTQ file: ${fastq}", - logFile - ) - fastq = null + ch_fastq_with_meta = ch_fastq + // reshapes the channel from a single emit of [meta, [fastq, fastq, fastq...]] + // to emits per fastq file like [meta, fastq] + .transpose() + .map { fc_meta, fastq -> + def meta = [:] + meta.id = fastq.getSimpleName().toString() - ~/_R[0-9]_001.*$/ + meta.samplename = fastq.getSimpleName().toString() - ~/_S[0-9]+.*$/ + meta.fcid = fc_meta.id + meta.lane = fc_meta.lane + // The buffered input stream allows reading directly from cloud storage + // It will not make a local copy of the file. 
+ fastq.withInputStream { + InputStream gzipStream = new java.util.zip.GZIPInputStream(it) + Reader decoder = new InputStreamReader(gzipStream, 'ASCII') + BufferedReader buffered = new BufferedReader(decoder) + line = buffered.readLine() + buffered.close() } - - return [meta, fastq] - }.filter { it[0] != null } - // Group by meta.id for PE samples - .groupTuple(by: [0]) - // Add meta.single_end - .map { meta, fastq -> - if (meta != null) { - meta.single_end = fastq.size() == 1 + if ( line != null && line.startsWith('@') ) { + line = line.substring(1) + // expected format is like: + // xx:yy:FLOWCELLID:LANE:... (seven fields) + fields = line.split(':') + // CASAVA 1.8+ format, from https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm + // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>" + sequencer_serial = fields[0] + run_nubmer = fields[1] + fcid = fields[2] + lane = fields[3] + index = fields[-1] =~ /[GATC+-]/ ? fields[-1] : "" + ID = [fcid, lane].join(".") + PU = [fcid, lane, index].findAll().join(".") + PL = "ILLUMINA" + SM = fastq.getSimpleName().toString() - ~/_S[0-9]+.*$/ + meta.readgroup = [ + "ID": ID, + "SM": SM, + "PL": PL, + "PU": PU + ] + meta.empty = false + } else { + println "No reads were found in FASTQ file: ${fastq}" + meta.readgroup = [:] + meta.empty = true } - return [meta, fastq.flatten()] - } -} - -// https://github.com/nf-core/sarek/blob/7ba61bde8e4f3b1932118993c766ed33b5da465e/workflows/sarek.nf#L1014-L1040 -def readgroup_from_fastq(path) { - // expected format: - // xx:yy:FLOWCELLID:LANE:... 
(seven fields) - - def line - - path.withInputStream { - InputStream gzipStream = new java.util.zip.GZIPInputStream(it) - Reader decoder = new InputStreamReader(gzipStream, 'ASCII') - BufferedReader buffered = new BufferedReader(decoder) - line = buffered.readLine() - } - assert line.startsWith('@') - line = line.substring(1) - def fields = line.split(':') - def rg = [:] - - // CASAVA 1.8+ format, from https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm - // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>" - sequencer_serial = fields[0] - run_nubmer = fields[1] - fcid = fields[2] - lane = fields[3] - index = fields[-1] =~ /[GATC+-]/ ? fields[-1] : "" - - rg.ID = [fcid,lane].join(".") - rg.PU = [fcid, lane, index].findAll().join(".") - rg.PL = "ILLUMINA" + return [meta, fastq] + } + // Group by the meta id so that we can find mate pairs if they exist + .groupTuple(by: [0]) + .map { meta, fastq -> + meta.single_end = fastq.size() == 1 + return [meta, fastq.flatten()] + } + .branch { + fastq : it[0].empty == false + empty_fastq : it[0].empty == true + } - return rg + emit: + fastq = ch_fastq_with_meta.fastq + empty_fastq = ch_fastq_with_meta.empty_fastq + reports = ch_reports + stats = ch_stats + interop = ch_interop + versions = ch_versions } diff --git a/subworkflows/nf-core/bcl_demultiplex/tests/main.nf.test b/subworkflows/nf-core/bcl_demultiplex/tests/main.nf.test index a41c4f67c13..028bd54097a 100644 --- a/subworkflows/nf-core/bcl_demultiplex/tests/main.nf.test +++ b/subworkflows/nf-core/bcl_demultiplex/tests/main.nf.test @@ -19,10 +19,10 @@ nextflow_workflow { workflow { """ input[0] = Channel.value([ - [id:'test', lane:1 ], - "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bcl/flowcell_samplesheet.csv", - "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bcl/flowcell.tar.gz" - ]) + [id:'HMTFYDRXX'], + 
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/bcl/SampleSheet.csv", + "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/bcl/200624_A00834_0183_BHMTFYDRXX.tar.gz" + ]) input[1] = "bclconvert" """ } @@ -39,7 +39,8 @@ nextflow_workflow { workflow.out.interop.get(0).get(1).findAll { file(it).name != "IndexMetricsOut.bin" }, ).match() }, - { assert file(workflow.out.interop.get(0).get(1).find { file(it).name == "IndexMetricsOut.bin" }).exists() } + { assert file(workflow.out.interop.get(0).get(1).find { file(it).name == "IndexMetricsOut.bin" }).exists() }, + { assert file(workflow.out.empty_fastq.get(0).get(1).find { file(it).name == "SampleZ_S5_L001_R1_001.fastq.gz" }).exists() } ) } } @@ -54,7 +55,7 @@ nextflow_workflow { [id:'test', lane:1 ], "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bcl/flowcell_samplesheet.csv", "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bcl/flowcell.tar.gz" - ]) + ]) input[1] = "bcl2fastq" """ } diff --git a/subworkflows/nf-core/bcl_demultiplex/tests/main.nf.test.snap b/subworkflows/nf-core/bcl_demultiplex/tests/main.nf.test.snap index 993f58fd9ff..82f7907512d 100644 --- a/subworkflows/nf-core/bcl_demultiplex/tests/main.nf.test.snap +++ b/subworkflows/nf-core/bcl_demultiplex/tests/main.nf.test.snap @@ -4,22 +4,21 @@ [ [ { - "id": "test", - "lane": 1 + "id": "HMTFYDRXX" }, [ - "Adapter_Cycle_Metrics.csv:md5,5a0c88793b4a0885fe3dda16609b576e", - "Adapter_Metrics.csv:md5,989240b8840b2169ac1061f952c90f6c", - "Demultiplex_Stats.csv:md5,93949a8cd96f907d83e0808c1ec2a04b", - "Demultiplex_Tile_Stats.csv:md5,83120160b0f22a1303fa1db31c19f6e9", - "IndexMetricsOut.bin:md5,9e688c58a5487b8eaf69c9e1005ad0bf", - "Index_Hopping_Counts.csv:md5,1059369e375fd8f8423c0f6c934be978", - "Quality_Metrics.csv:md5,6614accb1bb414fe312b17b81f5521f7", - 
"Quality_Tile_Metrics.csv:md5,cdc89fd2962bdd4a24f71e186112118a", - "RunInfo.xml:md5,03038959f4dd181c86bc97ae71fe270a", - "SampleSheet.csv:md5,dc0dffd39541dd6cc5b4801d768a8d2b", - "Top_Unknown_Barcodes.csv:md5,2e2faba761137f228e56bd3428453ccc", - "fastq_list.csv:md5,05bc84f51840f5754cfb8381b36f2cb0" + "Adapter_Cycle_Metrics.csv:md5,05fbe7b2b0acdd557d355b448aa88ace", + "Adapter_Metrics.csv:md5,0fa4ac708955417af9d18cec4955552f", + "Demultiplex_Stats.csv:md5,4a3f451faa098156623b55b0f2ff27ee", + "Demultiplex_Tile_Stats.csv:md5,8f6fb58990572c4aa19c0100d8351484", + "IndexMetricsOut.bin:md5,fb16c8a9873e5b5950ae5949126af76c", + "Index_Hopping_Counts.csv:md5,f59474d96afe8218c7590bb240b19690", + "Quality_Metrics.csv:md5,c4622066f85d93b1661c928a46cfc508", + "Quality_Tile_Metrics.csv:md5,e22bc5e2f147695150b02afcccb38c4f", + "RunInfo.xml:md5,f283cb4600235db9261ee1e319b1407e", + "SampleSheet.csv:md5,4113eabae23136cc819c7f15ac5b6aad", + "Top_Unknown_Barcodes.csv:md5,37dbc2860c640fc721820b0217ea0504", + "fastq_list.csv:md5,b2409de8a184e83554766cd4460240a4" ] ] ], @@ -31,18 +30,72 @@ { "id": "Sample1_S1_L001", "samplename": "Sample1", + "fcid": "HMTFYDRXX", "readgroup": { - "ID": "000000000-K9H97.1", - "PU": "000000000-K9H97.1", + "ID": "HMTFYDRXX.1", + "SM": "Sample1", "PL": "ILLUMINA", - "SM": "Sample1" + "PU": "HMTFYDRXX.1.GAACTGAGCG+TCGTGGAGCG" }, - "fcid": "test", - "lane": 1, + "empty": false, "single_end": true }, [ - "Sample1_S1_L001_R1_001.fastq.gz:md5,0675fb6365322eaafb33c0f8e862b54b" + "Sample1_S1_L001_R1_001.fastq.gz:md5,b5489d1964db8db5502eb742cc3ef3ec" + ] + ], + [ + { + "id": "Sample23_S3_L001", + "samplename": "Sample23", + "fcid": "HMTFYDRXX", + "readgroup": { + "ID": "HMTFYDRXX.1", + "SM": "Sample23", + "PL": "ILLUMINA", + "PU": "HMTFYDRXX.1.CGTCTCATAT+TATAGTAGCT" + }, + "empty": false, + "single_end": true + }, + [ + "Sample23_S3_L001_R1_001.fastq.gz:md5,767a1091320320b140288066e29bccc5" + ] + ], + [ + { + "id": "SampleA_S2_L001", + "samplename": "SampleA", + 
"fcid": "HMTFYDRXX", + "readgroup": { + "ID": "HMTFYDRXX.1", + "SM": "SampleA", + "PL": "ILLUMINA", + "PU": "HMTFYDRXX.1.AGGTCAGATA+CTACAAGATA" + }, + "empty": false, + "single_end": true + }, + [ + "SampleA_S2_L001_R1_001.fastq.gz:md5,7de2ea88133409f34563f40a0d8c9e55" + ] + ], + [ + { + "id": "sampletest_S4_L001", + "samplename": "sampletest", + "fcid": "HMTFYDRXX", + "readgroup": { + "ID": "HMTFYDRXX.1", + "SM": "sampletest", + "PL": "ILLUMINA", + "PU": "HMTFYDRXX.1.ATTCCATAAG+TGCCTGGTGG" + }, + "empty": false, + "single_end": true + }, + [ + "sampletest_S4_L001_R1_001.fastq.gz:md5,c16c7de1b7bffb5e4503f4d94c40f881" ] ] ], @@ -50,19 +103,28 @@ ], [ - "ControlMetricsOut.bin:md5,6d77b38d0793a6e1ce1e85706e488953", - "CorrectedIntMetricsOut.bin:md5,2bbf84d3be72734addaa2fe794711434", - "ErrorMetricsOut.bin:md5,38c88def138e9bb832539911affdb286", - "ExtractionMetricsOut.bin:md5,7497c3178837eea8f09350b5cd252e99", - "QMetricsOut.bin:md5,7e9f198d53ebdfbb699a5f94cf1ed51c", - "TileMetricsOut.bin:md5,83891751ec1c91a425a524b476b6ca3c" + "BasecallingMetricsOut.bin:md5,7fb651325cba614d497d376eaf43fef4", + "CorrectedIntMetricsOut.bin:md5,dc8d57282ba9ece9e5fc58a92aa2ac52", + "EmpiricalPhasingMetricsOut.bin:md5,1ef4631faf0a3a3beb31b10fc38a734d", + "EventMetricsOut.bin:md5,dee320ce29bdadde44589aa9439f53ab", + "ExtendedTileMetricsOut.bin:md5,f01d1a9cf8445adf719e652ad7304cf2", + "ExtractionMetricsOut.bin:md5,972f4082ad950baaf42a6d28517d28a8", + "FWHMGridMetricsOut.bin:md5,6e297bafcd845bfd0440d08e1bb27685", + "ImageMetricsOut.bin:md5,ac5d1f0a1f611c0c7c9dd8e6b9e701b1", + "OpticalModelMetricsOut.bin:md5,3eaea5fcf2d353950b1e720c73695ccb", + "PFGridMetricsOut.bin:md5,ae469858ee96ffafbcaf3afb814bdab2", + "QMetrics2030Out.bin:md5,438248760db58917b32f4eccc6c64c39", + "QMetricsByLaneOut.bin:md5,e8254cb4a27846710a2a143296be2d8f", + "QMetricsOut.bin:md5,8f6b83028a42be721200a598161ac5c6", + "RegistrationMetricsOut.bin:md5,b5ebd957aed067b6403d851ba2ce0139", + 
"TileMetricsOut.bin:md5,21388348d81fa9be326d30ef6d348464" ] ], "meta": { "nf-test": "0.8.4", "nextflow": "24.04.2" }, - "timestamp": "2024-07-10T11:37:10.291289677" + "timestamp": "2024-06-26T20:28:00.234964" }, "bcl2fastq": { "content": [ @@ -123,14 +185,15 @@ { "id": "Sample1_S1_L001", "samplename": "Sample1", + "fcid": "test", + "lane": 1, "readgroup": { "ID": "000000000-K9H97.1", - "PU": "000000000-K9H97.1", + "SM": "Sample1", "PL": "ILLUMINA", - "SM": "Sample1" + "PU": "000000000-K9H97.1" }, - "fcid": "test", - "lane": 1, + "empty": false, "single_end": true }, [ @@ -167,6 +230,6 @@ "nf-test": "0.8.4", "nextflow": "23.10.1" }, - "timestamp": "2024-05-07T09:01:39.665409003" + "timestamp": "2024-06-26T20:28:19.854819" } } \ No newline at end of file diff --git a/subworkflows/nf-core/bcl_demultiplex/tests/nextflow.config b/subworkflows/nf-core/bcl_demultiplex/tests/nextflow.config index 31977c9c2ec..dd8bfa2ada6 100644 --- a/subworkflows/nf-core/bcl_demultiplex/tests/nextflow.config +++ b/subworkflows/nf-core/bcl_demultiplex/tests/nextflow.config @@ -2,8 +2,7 @@ process { withName: BCLCONVERT { ext.args = {[ meta.lane ? "--bcl-only-lane ${meta.lane}" : "", - "--force", - "--first-tile-only true" + "--force" ].join(" ").trim()} } withName: BCL2FASTQ { diff --git a/tests/config/nf-test.config b/tests/config/nf-test.config index 2c4467c3acb..0bbe5e5b0c0 100644 --- a/tests/config/nf-test.config +++ b/tests/config/nf-test.config @@ -32,7 +32,7 @@ profiles { docker.enabled = true docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } - docker_self_hosted{ + docker_self_hosted { docker.enabled = true docker.fixOwnership = true docker.runOptions = '--platform=linux/amd64'