diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9fc6104e6..b0c8f226f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -29,6 +29,7 @@ test-job: - export ASAN_OPTIONS="detect_leaks=0" - CGL_DEBUG=ultra make -j 8 - CACTUS_BINARIES_MODE=local SON_TRACE_DATASETS=$(pwd)/cactusTestData CACTUS_TEST_CHOICE=normal make test + - pip install -U newick - make -j 8 hal_test # rebuild without all the debug flags - make clean diff --git a/BIN-INSTALL.md b/BIN-INSTALL.md index 3a5dc11bc..d14f05106 100644 --- a/BIN-INSTALL.md +++ b/BIN-INSTALL.md @@ -6,18 +6,18 @@ pre-compile binary, static linked distribution. ## Extracting If you have not already extract the distribution and cd into the cactus directory: ``` -tar -xzf cactus-bin-v2.6.8.tar.gz -cd cactus-bin-v2.6.8 +tar -xzf cactus-bin-v2.6.13.tar.gz +cd cactus-bin-v2.6.13 ``` ## Setup To build a python virtualenv and activate, do the following steps. This requires Python version >= 3.7 (so Ubuntu 18.04 users should use `-p python3.8` below): ``` -virtualenv -p python3 venv-cactus-v2.6.8 -printf "export PATH=$(pwd)/bin:\$PATH\nexport PYTHONPATH=$(pwd)/lib:\$PYTHONPATH\n" >> venv-cactus-v2.6.8/bin/activate -source venv-cactus-v2.6.8/bin/activate -python3 -m pip install -U setuptools pip +virtualenv -p python3 venv-cactus-v2.6.13 +printf "export PATH=$(pwd)/bin:\$PATH\nexport PYTHONPATH=$(pwd)/lib:\$PYTHONPATH\n" >> venv-cactus-v2.6.13/bin/activate +source venv-cactus-v2.6.13/bin/activate +python3 -m pip install -U setuptools pip wheel python3 -m pip install -U . 
python3 -m pip install -U -r ./toil-requirement.txt ``` diff --git a/Dockerfile b/Dockerfile index 9c196e693..412f8fa52 100644 --- a/Dockerfile +++ b/Dockerfile @@ -87,6 +87,9 @@ RUN chmod 777 /opt/cactus/wrapper.sh # log the memory usage (with --realTimeLogging) for local commands ENV CACTUS_LOG_MEMORY 1 +# remember we're in a docker to help with error handling +ENV CACTUS_INSIDE_CONTAINER 1 + # UCSC convention is to work in /data RUN mkdir -p /data WORKDIR /data diff --git a/Makefile b/Makefile index fa7c2afc0..5b6fbc710 100644 --- a/Makefile +++ b/Makefile @@ -92,24 +92,6 @@ unitTests = \ #paffyTests \ # This is removed for now -# these are slow, but added to CI here since hal no longer has its own -halTests = \ - hal4dExtractTest \ - halAlignmentTreesTest \ - halBottomSegmentTest \ - halColumnIteratorTest \ - halGappedSegmentIteratorTest \ - halGenomeTest \ - halHdf5Tests \ - halLiftoverTests \ - halMafTests \ - halMappedSegmentTest \ - halMetaDataTest \ - halRearrangementTest \ - halSequenceTest \ - halTopSegmentTest \ - halValidateTest - # if running travis or gitlab, we want output to go to stdout/stderr so it can # be seen in the log file, as opposed to individual files, which are much # easier to read when running tests in parallel. 
@@ -133,7 +115,8 @@ testLogDir = ${testOutDir}/logs test: ${testModules:%=%_runtest} ${unitTests:%=%_run_unit_test} test_blast: ${testModules:%=%_runtest_blast} test_nonblast: ${testModules:%=%_runtest_nonblast} -hal_test: ${halTests:%=%_run_unit_test} +hal_test: + cd ${CWD}/submodules/hal && make test # run one test and save output %_runtest: ${versionPy} diff --git a/README.md b/README.md index 29b2393a8..7e3007eff 100644 --- a/README.md +++ b/README.md @@ -54,14 +54,14 @@ virtualenv -p python3 cactus_env echo "export PATH=$(pwd)/bin:\$PATH" >> cactus_env/bin/activate echo "export PYTHONPATH=$(pwd)/lib:\$PYTHONPATH" >> cactus_env/bin/activate source cactus_env/bin/activate -python3 -m pip install -U setuptools pip +python3 -m pip install -U setuptools pip wheel python3 -m pip install -U . python3 -m pip install -U -r ./toil-requirement.txt ``` If you have Docker installed, you can now run Cactus. All binaries, such as `lastz` and `cactus-consolidated` will be run via Docker. Singularity binaries can be used in place of docker binaries with the `--binariesMode singularity` flag. Note, you must use Singularity 2.3 - 2.6 or Singularity 3.1.0+. Singularity 3 versions below 3.1.0 are incompatible with cactus (see [issue #55](https://github.com/ComparativeGenomicsToolkit/cactus/issues/55) and [issue #60](https://github.com/ComparativeGenomicsToolkit/cactus/issues/60)). -By default, cactus will use the image, `quay.io/comparative-genomics-toolkit/cactus:` when running binaries. This is usually okay, but can be overridden with the `CACTUS_DOCKER_ORG` and `CACTUS_DOCKER_TAG` environment variables. For example, to use GPU release 2.4.4, run `export CACTUS_DOCKER_TAG=v2.4.4-gpu` before running cactus. +By default, cactus will use the image corresponding to the latest release when running docker binaries. This is usually okay, but can be overridden with the `CACTUS_DOCKER_ORG` and `CACTUS_DOCKER_TAG` environment variables. 
For example, to use GPU release 2.4.4, run `export CACTUS_DOCKER_TAG=v2.4.4-gpu` before running cactus. ### Compiling Binaries Locally In order to compile the binaries locally and not use a Docker image, you need some dependencies installed. On Ubuntu (we've tested on 20.04 and 22.04), you can look at the [Cactus Dockerfile](./Dockerfile) for guidance. To obtain the `apt-get` command: diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 950644411..4f92b5dbf 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -1,3 +1,51 @@ +# Release 2.6.13 2023-11-15 + +This release fixes an issue where Toil can ask for way too much memory for minigraph construction +- Cut default minigraph construction memory estimate by half +- Add `--mgMemory` option to override minigraph construction memory estimate no matter what +- Exit with a clear error message (instead of more cryptic crash) when user tries to run container binaries in a container +- Fix double Toil delete that seems to cause fatal error in some environments +- Fix `gfaffix` regular expression bug that could cause paths other than the reference to be protected from collapse. + +# Release 2.6.12 2023-11-07 + +The release contains fixes for some recent regressions: + +- Include more portable (at least on Ubuntu) `gfaffix` binary. +- Fix error where gpu support on singularity is completely broken. +- Fix `export_hal` and `export_vg` job memory estimates when `--consMemory` not provided.
+ +# Release 2.6.11 2023-10-31 + +This release fixes a bug introduced in v2.6.10 that prevents diploid samples from working with `cactus-pangenome` + +- Remove stray `assert False` from diploid mash distance that was accidentally included in previous release + +# Release 2.6.10 2023-10-30 + +This release contains bug fixes for MAF export and the pangenome pipeline + +- Patch `taffy` to fix bug where sometimes length fields in output MAF can be wrong when using `cactus-hal2maf --filterGapCausingDupes` +- Fix regression `cactus-graphmap-split / cactus-pangenome` so that small poorly aligned reference contigs (like some tiny unplaced GRCh38 contigs) do not get unintentionally filtered out. These contigs do not help the graph in any way, but the tool should do what it says and make a component for every single reference contig no matter what, which it is now fixed to do. +- Cactus will now terminate with a clear error message if any other `--batchSystem` than `single_machine` is attempted from *inside* a docker container. +- Mash distance order into `minigraph` construction fixed so that haplotypes from the same sample are always added contiguously in the presence of ties. +- CI fixed to run all `hal` tests, and not just a subset. +- `pip install wheel` added to installation instructions, as apparently that's needed to install Cactus with some (newer?) Pythons. + +# Release 2.6.9 2023-10-20 + +This release contains some bug fixes and changes to docker image uploads + +- GFAffix updated to latest release +- CI no longer pushes a docker image to quay.io for every single commit. 
+- CPU docker release now made locally as done for GPU +- `--binariesMode docker` will automatically point to release image (using GPU one as appropriate) +- `--consMemory` overrides `hal2vg` memory as well +- `--defaultMemory` defaults to `4Gi` when using docker binaries +- SegAlign job memory specification increased to something more realistic +- `--lastzMemory` option added to override SegAlign memory -- highly recommended on SLURM +- chromosome (.vg / .og) outputs from pangenome pipeline will have ref paths of form `GRCh38#0#chr1` instead of `GRCh38#chr1` to be more consistent with full-genome indexes (and PanSN in general) + # Release 2.6.8 2023-09-28 This release includes several bug fixes for the pangenome pipeline diff --git a/api/tests/cactusParamsTest.c b/api/tests/cactusParamsTest.c index 938363a85..32e5821e3 100644 --- a/api/tests/cactusParamsTest.c +++ b/api/tests/cactusParamsTest.c @@ -27,7 +27,7 @@ static void testCactusParams(CuTest *testCase) { CuAssertTrue(testCase, length >= 3); CuAssertIntEquals(testCase, l[0], 2); CuAssertIntEquals(testCase, l[1], 32); - CuAssertIntEquals(testCase, l[2], 512); + CuAssertIntEquals(testCase, l[2], 256); // Test moving the root of the search cactusParams_set_root(p, 1, "caf"); diff --git a/build-tools/downloadMafTools b/build-tools/downloadMafTools index 0d8a1a360..8cee67cd5 100755 --- a/build-tools/downloadMafTools +++ b/build-tools/downloadMafTools @@ -51,7 +51,7 @@ export HTSLIB_LIBS="$(pwd)/libhts.a -lbz2 -ldeflate -lm -lpthread -lz -llzma -pt cd ${mafBuildDir} git clone https://github.com/ComparativeGenomicsToolkit/taffy.git cd taffy -git checkout c75ce895b7975e7ac17cb1ce964db3016615de47 +git checkout ee50639be3d86451590de8ea4d3a7a037eeaf427 git submodule update --init --recursive export HALDIR=${CWD}/submodules/hal make -j ${numcpu} @@ -66,7 +66,7 @@ fi cd ${mafBuildDir} git clone https://github.com/ComparativeGenomicsToolkit/mafTools.git cd mafTools -git checkout
837b8f27c7bf781c7cbee3972b94e91aa6a77790 +git checkout b88cd313cb18764d87bc801fbbbb00f982c1a48f find . -name "*.mk" | xargs sed -ie "s/-Werror//g" find . -name "Makefile*" | xargs sed -ie "s/-Werror//g" # hack in flags support diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index 4491ba113..5ebdd7753 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -156,7 +156,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/ComparativeGenomicsToolkit/cactus-gfa-tools.git cd cactus-gfa-tools -git checkout 9b26caa961d6e72ad3747e5c2ce81cdf1e9b63c3 +git checkout 0c17bc4ae9a7cf174fa40805cde7f8f1f6de8225 make -j ${numcpu} if [[ $STATIC_CHECK -ne 1 || $(ldd paf2lastz | grep so | wc -l) -eq 0 ]] then @@ -279,7 +279,7 @@ fi # vg cd ${pangenomeBuildDir} #wget -q https://github.com/vgteam/vg/releases/download/v1.51.0/vg -wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.9df2a056197cafd817cf48c76cf662dd775d265d -O vg +wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.98e3b7c867eb64178298535b076189ef7fda5031 -O vg chmod +x vg if [[ $STATIC_CHECK -ne 1 || $(ldd vg | grep so | wc -l) -eq 0 ]] then @@ -290,12 +290,12 @@ fi # gfaffix cd ${pangenomeBuildDir} -wget -q https://github.com/marschall-lab/GFAffix/releases/download/0.1.5/GFAffix-0.1.5_linux_x86_64.tar.gz -tar xzf GFAffix-0.1.5_linux_x86_64.tar.gz -chmod +x GFAffix-0.1.5_linux_x86_64/gfaffix -if [[ $STATIC_CHECK -ne 1 || $(ldd GFAffix-0.1.5_linux_x86_64/gfaffix | grep so | wc -l) -eq 0 ]] +wget -q https://github.com/marschall-lab/GFAffix/releases/download/0.1.5b/GFAffix-0.1.5b_linux_x86_64.tar.gz +tar xzf GFAffix-0.1.5b_linux_x86_64.tar.gz +chmod +x GFAffix-0.1.5b_linux_x86_64/gfaffix +if [[ $STATIC_CHECK -ne 1 || $(ldd GFAffix-0.1.5b_linux_x86_64/gfaffix | grep so | wc -l) -eq 0 ]] then - mv GFAffix-0.1.5_linux_x86_64/gfaffix ${binDir} + mv GFAffix-0.1.5b_linux_x86_64/gfaffix ${binDir} else exit 1 fi diff --git 
a/build-tools/makeCpuDockerRelease b/build-tools/makeCpuDockerRelease index e59e05892..57a2f01c3 100755 --- a/build-tools/makeCpuDockerRelease +++ b/build-tools/makeCpuDockerRelease @@ -24,10 +24,11 @@ git checkout "${REL_TAG}" git submodule update --init --recursive docker build . -f Dockerfile -t ${dockname}:${REL_TAG} +docker tag ${dockname}:${REL_TAG} ${dockname}:latest read -p "Are you sure you want to push ${dockname}:${REL_TAG} to quay?" yn case $yn in - [Yy]* ) docker push ${dockname}:${REL_TAG}; break;; + [Yy]* ) docker push ${dockname}:${REL_TAG} && docker push ${dockname}:latest ; break;; [Nn]* ) exit;; * ) echo "Please answer yes or no.";; esac diff --git a/doc/pangenome.md b/doc/pangenome.md index a78934687..3d7ab05a3 100644 --- a/doc/pangenome.md +++ b/doc/pangenome.md @@ -128,6 +128,7 @@ The Minigraph-Cactus pipeline is run via the `cactus-pangenome` command. It cons **Before running large jobs, it is important to consider the following options:** * `--mgCores` the number of cores for `minigraph` construction (default: all available) +* `--mgMemory` the amount of memory for `minigraph` construction. The default estimate can be quite conservative (ie high), so if it is too high for your system, you can lower it with this option (default: estimate based on input size). * `--mapCores` the number of cores for each `minigraph` mapping job (default: up to 6) * `--consCores` the number of cores for each `cactus-consolidated` job (default: all available) * `--consMemory` the amount of memory for each `cactus-consolidated` job. By default, it is estimated from the data but these estimates being wrong can be catastrophic on [SLURM](./progressive.md#running-on-a-cluster). 
Consider setting to the maximum memory you have available when running on a cluster to be extra safe (seems to be more of an issue for non-human data) diff --git a/doc/progressive.md b/doc/progressive.md index b43e32802..c418b5428 100644 --- a/doc/progressive.md +++ b/doc/progressive.md @@ -170,12 +170,12 @@ The Cactus Docker image contains everything you need to run Cactus (python envir ``` wget -q https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactus/master/examples/evolverMammals.txt -O evolverMammals.txt -docker run -v $(pwd):/data --rm -it quay.io/comparative-genomics-toolkit/cactus:v2.6.8 cactus /data/jobStore /data/evolverMammals.txt /data/evolverMammals.hal +docker run -v $(pwd):/data --rm -it quay.io/comparative-genomics-toolkit/cactus:v2.6.13 cactus /data/jobStore /data/evolverMammals.txt /data/evolverMammals.hal ``` Or you can proceed interactively by running ``` -docker run -v $(pwd):/data --rm -it quay.io/comparative-genomics-toolkit/cactus:v2.6.8 bash +docker run -v $(pwd):/data --rm -it quay.io/comparative-genomics-toolkit/cactus:v2.6.13 bash cactus /data/jobStore /data/evolverMammals.txt /data/evolverMammals.hal ``` @@ -204,6 +204,16 @@ export TOIL_SLURM_ARGS="--nice=5000" to avoid making too many enemies. +You can (and probably should) use the `--batchLogsDir` option in order to enable more SLURM logging. You must pass it a directory that already exists. Ex. + +``` +mkdir -p batch-logs +cactus ./js ./examples/evolverMammals.txt evolverMammals.hal --batchSystem slurm --batchLogsDir batch-logs +``` + +You'll want to clean out this directory after a successful run. + + You cannot run `cactus --batchSystem slurm` from *inside* the Cactus docker container, because the Cactus docker container doesn't contain slurm. Therefore in order to use slurm, you must be able to `pip install` Cactus inside a virtualenv on the head node. 
You can still use `--binariesMode docker` or `--binariesMode` singularity to run cactus *binaries* from a container, but the Cactus Python module needs to be installed locally. **IMPORTANT** @@ -211,7 +221,7 @@ You cannot run `cactus --batchSystem slurm` from *inside* the Cactus docker cont To run Progressive Cactus with CPU (default) lastz, you should increase the chunk size. This will divide the input assemblies into fewer pieces, resulting in fewer jobs on the cluster. ``` -cp cactus-bin-v2.6.8/src/cactus/cactus_progressive_config.xml ./config-slurm.xml +cp cactus-bin-v2.6.13/src/cactus/cactus_progressive_config.xml ./config-slurm.xml sed -i config-slurm.xml -e 's/blast chunkSize="30000000"/blast chunkSize="90000000"/g' sed -i config-slurm.xml -e 's/dechunkBatchSize="1000"/dechunkBatchSize="200"/g' ``` @@ -340,6 +350,11 @@ We've tested SegAlign on Nvidia V100 and A10G GPUs. See the Terra example above Please [cite SegAlign](https://doi.ieeecomputersociety.org/10.1109/SC41405.2020.00043). +### Using GPU Acceleration on a Cluster + +Since `SegAlign` is only released in the GPU-enabled docker image, that's the easiest way to run it. When running on a cluster, this usually means the best way to use it is with `--binariesMode docker --gpu `. This way cactus is installed locally on your virtual environment and can run slurm commands like `sbatch` (that aren't available in the Cactus container), but SegAlign itself will be run from inside Docker. + +**Important**: Consider using `--lastzMemory` when using GPU acceleration on a cluster. Like `--consMemory`, it lets you override the amount of memory Toil requests which can help with errors if Cactus's automatic estimate is either too low (cluster evicts the job) or too high (cluster cannot schedule the job). 
## Pre-Alignment Checklist diff --git a/examples/evolverPrimates.txt b/examples/evolverPrimates.txt index 54495a980..9c8623807 100644 --- a/examples/evolverPrimates.txt +++ b/examples/evolverPrimates.txt @@ -1,4 +1,6 @@ -simHuman https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/primates/loci1/simHuman.chr6 +(simOrang:0.00993,((simChimp:0.00272,simHuman:0.00269)cb:0.00415,simGorilla:0.00644)hcb:0.00046); + +simOrang https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/primates/loci1/simOrang.chr6 simChimp https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/primates/loci1/simChimp.chr6 +simHuman https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/primates/loci1/simHuman.chr6 simGorilla https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/primates/loci1/simGorilla.chr6 -simOrang https://raw.githubusercontent.com/ComparativeGenomicsToolkit/cactusTestData/master/evolver/primates/loci1/simOrang.chr6 diff --git a/setup.py b/setup.py index bb1085b1a..d8c482a37 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ def run(self): setup( name = "Cactus", - version = "2.6.8", + version = "2.6.13", author = "Benedict Paten", package_dir = {'': 'src'}, packages = find_packages(where='src'), diff --git a/src/cactus/blast/cactus_blast.py b/src/cactus/blast/cactus_blast.py index 96469d769..f4deadccb 100644 --- a/src/cactus/blast/cactus_blast.py +++ b/src/cactus/blast/cactus_blast.py @@ -19,6 +19,7 @@ from cactus.shared.common import enableDumpStack from cactus.shared.common import cactus_override_toil_options from cactus.shared.version import cactus_commit +from cactus.progressive.cactus_prepare import human2bytesN from cactus.paf.local_alignment import sanitize_then_make_paf_alignments @@ -60,8 +61,11 @@ def main(): parser.add_argument("--binariesMode", choices=["docker", "local", 
"singularity"], help="The way to run the Cactus binaries", default=None) parser.add_argument("--gpu", nargs='?', const='all', default=None, help="toggle on GPU-enabled lastz, and specify number of GPUs (all available if no value provided)") - parser.add_argument("--lastzCores", type=int, default=None, help="Number of cores for each lastz job, only relevant when running with --gpu") - + parser.add_argument("--lastzCores", type=int, default=None, help="Number of cores for each lastz/segalign job, only relevant when running with --gpu") + parser.add_argument("--lastzMemory", type=human2bytesN, + help="Memory in bytes for each lastz/segalign job (defaults to an estimate based on the input data size). " + "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None) + options = parser.parse_args() setupBinaries(options) @@ -95,7 +99,7 @@ def runCactusBlastOnly(options): # load up the seqfile and figure out the outgroups and schedule config_node = ET.parse(options.configFile).getroot() config_wrapper = ConfigWrapper(config_node) - config_wrapper.substituteAllPredefinedConstantsWithLiterals() + config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) # apply gpu override config_wrapper.initGPU(options) mc_tree, input_seq_map, og_candidates = parse_seqfile(options.seqFile, config_wrapper) diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index a777118eb..58a6d8c33 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -57,6 +57,9 @@ + diff --git a/src/cactus/maf/cactus_hal2chains.py b/src/cactus/maf/cactus_hal2chains.py index 1c3c88840..a3a7c5fe7 100644 --- a/src/cactus/maf/cactus_hal2chains.py +++ b/src/cactus/maf/cactus_hal2chains.py @@ -102,7 +102,7 @@ def main(): #load cactus config configNode = ET.parse(options.configFile).getroot() config = ConfigWrapper(configNode) - config.substituteAllPredefinedConstantsWithLiterals() + 
config.substituteAllPredefinedConstantsWithLiterals(options) logger.info("Importing {}".format(options.halFile)) hal_id = toil.importFile(options.halFile) diff --git a/src/cactus/maf/cactus_hal2maf.py b/src/cactus/maf/cactus_hal2maf.py index 54b07dcdd..331f89c2b 100644 --- a/src/cactus/maf/cactus_hal2maf.py +++ b/src/cactus/maf/cactus_hal2maf.py @@ -179,7 +179,7 @@ def main(): #load cactus config configNode = ET.parse(options.configFile).getroot() config = ConfigWrapper(configNode) - config.substituteAllPredefinedConstantsWithLiterals() + config.substituteAllPredefinedConstantsWithLiterals(options) logger.info("Importing {}".format(options.halFile)) hal_id = toil.importFile(options.halFile) diff --git a/src/cactus/maf/cactus_maf2bigmaf.py b/src/cactus/maf/cactus_maf2bigmaf.py index 04b06dfd2..1677dce9d 100644 --- a/src/cactus/maf/cactus_maf2bigmaf.py +++ b/src/cactus/maf/cactus_maf2bigmaf.py @@ -84,7 +84,7 @@ def main(): #load cactus config configNode = ET.parse(options.configFile).getroot() config = ConfigWrapper(configNode) - config.substituteAllPredefinedConstantsWithLiterals() + config.substituteAllPredefinedConstantsWithLiterals(options) logger.info("Importing {}".format(options.mafFile)) maf_id = toil.importFile(options.mafFile) diff --git a/src/cactus/paf/local_alignment.py b/src/cactus/paf/local_alignment.py index da8716048..876cd9ebe 100755 --- a/src/cactus/paf/local_alignment.py +++ b/src/cactus/paf/local_alignment.py @@ -129,7 +129,8 @@ def combine_chunks(job, chunked_alignment_files, batch_size): outfile=alignment_file, outappend=True) job.fileStore.deleteGlobalFile(chunk) # Cleanup the old files return job.fileStore.writeGlobalFile(alignment_file) # Return the final alignments file copied to the jobstore - + + def merge_combined_chunks(job, combined_chunks): # mege up and return some chunks, deleting them too output_path = job.fileStore.getLocalTempFile() @@ -138,10 +139,10 @@ def merge_combined_chunks(job, combined_chunks): chunk_path = 
job.fileStore.readGlobalFile(chunk, mutable=True) with open(chunk_path, 'r') as chunk_file: shutil.copyfileobj(chunk_file, output_file) - os.remove(chunk_path) job.fileStore.deleteGlobalFile(chunk) return job.fileStore.writeGlobalFile(output_path) + def make_chunked_alignments(job, event_a, genome_a, event_b, genome_b, distance, params): lastz_params_node = params.find("blast") gpu = getOptionalAttrib(lastz_params_node, 'gpu', typeFn=int, default=0) @@ -149,6 +150,7 @@ def make_chunked_alignments(job, event_a, genome_a, event_b, genome_b, distance, # wga-gpu has a 6G limit, so we always override lastz_params_node.attrib['chunkSize'] = '6000000000' lastz_cores = getOptionalAttrib(lastz_params_node, 'cpu', typeFn=int, default=None) + lastz_memory = getOptionalAttrib(lastz_params_node, 'lastz_memory', typeFn=int, default=None) def make_chunks(genome): output_chunks_dir = job.fileStore.getLocalTempDir() @@ -171,16 +173,37 @@ def make_chunks(genome): for j, chunk_b in enumerate(chunks_b): mappers = { "lastz":run_lastz, "minimap2":run_minimap2} mappingFn = mappers[params.find("blast").attrib["mapper"]] + memory = lastz_memory if lastz_memory else max(200000000, 15*(chunk_a.size+chunk_b.size)) chunked_alignment_files.append(job.addChildJobFn(mappingFn, '{}_{}'.format(event_a, i), chunk_a, '{}_{}'.format(event_b, j), chunk_b, distance, params, - cores=lastz_cores, disk=4*(chunk_a.size+chunk_b.size), - memory=max(200000000, 15*(chunk_a.size+chunk_b.size)), + cores=lastz_cores, + disk=max(4*(chunk_a.size+chunk_b.size), memory), + memory=memory, accelerators=accelerators).rv()) dechunk_batch_size = getOptionalAttrib(lastz_params_node, 'dechunkBatchSize', typeFn=int, default=1e9) return job.addFollowOnJobFn(combine_chunks, chunked_alignment_files, dechunk_batch_size).rv() # Combine the chunked alignment files +def invert_alignments(job, alignment_file): + """ Invert the pafs in the alignment_file """ + alignment_file_local = job.fileStore.readGlobalFile(alignment_file) + 
inverted_alignment_file_local = job.fileStore.getLocalTempFile() # Get a temporary file to store the alignments in + cactus_call(parameters=['paffy', 'invert', "-i", alignment_file_local], outfile=inverted_alignment_file_local, outappend=True, + job_memory=job.memory) + job.fileStore.deleteGlobalFile(alignment_file) + return job.fileStore.writeGlobalFile(inverted_alignment_file_local) + + +def make_ingroup_to_outgroup_alignments_0(job, ingroup_event, outgroup_events, event_names_to_sequences, distances, params): + # Generate the alignments fle + alignment_file = job.addChildJobFn(make_ingroup_to_outgroup_alignments_1, ingroup_event, outgroup_events, + event_names_to_sequences, distances, params).rv() + + # Invert the final alignment so that the query is the outgroup and the target is the ingroup + return job.addFollowOnJobFn(invert_alignments, alignment_file).rv() + + def make_ingroup_to_outgroup_alignments_1(job, ingroup_event, outgroup_events, event_names_to_sequences, distances, params): # a job should never set its own follow-on, so we hang everything off root_job here to encapsulate root_job = Job() @@ -265,8 +288,56 @@ def make_ingroup_to_outgroup_alignments_3(job, ingroup_event, ingroup_seq_file, return job.fileStore.writeGlobalFile(merged_alignments) -def chain_alignments(job, alignment_files, alignment_names, reference_event_name, params): - # The following recapitulates the pipeline showing in paffy/tests/paf_pipeline_test.sh +def merge_alignments(job, alignment_file1, alignment_file2): + """" Merge together two alignment files """ + # Get a temporary directory to work in + work_dir = job.fileStore.getLocalTempDir() + + # The merged alignment file + merged_alignments_file = os.path.join(work_dir, 'output_alignments.paf') + + # Read the files to local disk and merge them together, don't bother to log + cactus_call(parameters=['cat', + job.fileStore.readGlobalFile(alignment_file1, os.path.join(work_dir, 'alignment_1.paf')), + 
job.fileStore.readGlobalFile(alignment_file2, os.path.join(work_dir, 'alignment_2.paf'))], + outfile=merged_alignments_file, job_memory=job.memory) + + # Write the merged file back to the job store and return its path + return job.fileStore.writeGlobalFile(merged_alignments_file) + + +def chain_alignments_splitting_ingroups_and_outgroups(job, ingroup_alignment_files, ingroup_alignment_names, + outgroup_alignment_files, outgroup_alignment_names, + reference_event_name, params): + """ Chains/tiles/etc. the ingroup alignments to each other so that every ingroup sequence has a primary alignment to + another ingroup sequence, and separately each outgroup sequence has a primary alignment to an ingroup.""" + + # Check we have the expected number of alignment files + assert len(ingroup_alignment_files) == len(ingroup_alignment_names) + assert len(outgroup_alignment_files) == len(outgroup_alignment_names) + + # Chain and pick the primary alignments of the ingroups to each other + chained_ingroup_alignments = job.addChildJobFn(chain_alignments, ingroup_alignment_files, + ingroup_alignment_names, reference_event_name, params).rv() + + # Separately pick the primary of the outgroups to the ingroups. 
By setting include_inverted_alignments=False + # we only get outgroup-to-ingroup alignments and not ingroup-to-outgroup alignments and therefore primary + # alignments along the outgroup sequences + chained_outgroup_alignments = job.addChildJobFn(chain_alignments, outgroup_alignment_files, + outgroup_alignment_names, reference_event_name, params, + include_inverted_alignments=False).rv() + + # Calculate approximately total alignment file size + total_file_size = sum(alignment_file.size for alignment_file in ingroup_alignment_files + outgroup_alignment_files) + + # Merge the resulting two alignment files into a single set of alignments + return job.addFollowOnJobFn(merge_alignments, chained_ingroup_alignments, chained_outgroup_alignments, + disk=total_file_size*10).rv() + + +def chain_alignments(job, alignment_files, alignment_names, reference_event_name, params, + include_inverted_alignments=True): + """The following recapitulates the pipeline shown in paffy/tests/paf_pipeline_test.sh""" root_job = Job() job.addChild(root_job) @@ -276,7 +347,8 @@ # do the chaining chained_alignment_files = [] for alignment_file, alignment_name in zip(alignment_files, alignment_names): - chained_alignment_files.append(root_job.addChildJobFn(chain_one_alignment, alignment_file, alignment_name, params, + chained_alignment_files.append(root_job.addChildJobFn(chain_one_alignment, alignment_file, alignment_name, + params, include_inverted_alignments, disk=6*alignment_file.size, memory=32*alignment_file.size).rv()) @@ -285,8 +357,11 @@ disk=6*sum([alignment_file.size for alignment_file in alignment_files]), memory=32*sum([alignment_file.size for alignment_file in alignment_files])).rv() -def chain_one_alignment(job, alignment_file, alignment_name, params): - # run paffy chain on one PAF + +def chain_one_alignment(job,
alignment_file, alignment_name, params, include_inverted_alignments): + """ run paffy chain on one PAF. include_inverted_alignemnts is a flag to control if we additionally include + the inverted paf records for chaining. + """ work_dir = job.fileStore.getLocalTempDir() alignment_path = os.path.join(work_dir, alignment_name + '.paf') @@ -296,13 +371,15 @@ def chain_one_alignment(job, alignment_file, alignment_name, params): # Copy the alignments from the job store job.fileStore.readGlobalFile(alignment_file, alignment_path) - # Get the forward and reverse versions of each alignment for symmetry with chaining - shutil.copyfile(alignment_path, alignment_inv_path) - cactus_call(parameters=['paffy', 'invert', "-i", alignment_path], outfile=alignment_inv_path, outappend=True, - job_memory=job.memory) + # Get the forward and reverse versions of each alignment for symmetry with chaining if include_inverted_alignments + # is set + if include_inverted_alignments: + shutil.copyfile(alignment_path, alignment_inv_path) + cactus_call(parameters=['paffy', 'invert', "-i", alignment_inv_path], outfile=alignment_path, outappend=True, + job_memory=job.memory) # Now chain the alignments - cactus_call(parameters=['paffy', 'chain', "-i", alignment_inv_path, + cactus_call(parameters=['paffy', 'chain', "-i", alignment_path, "--maxGapLength", params.find("blast").attrib["chainMaxGapLength"], "--chainGapOpen", params.find("blast").attrib["chainGapOpen"], "--chainGapExtend", params.find("blast").attrib["chainGapExtend"], @@ -400,6 +477,7 @@ def sanitize_then_make_paf_alignments(job, event_tree_string, event_names_to_seq paf_job = sanitize_job.addFollowOnJobFn(make_paf_alignments, event_tree_string, sanitize_job.rv(), ancestor_event_string, params) return paf_job.rv() + def make_paf_alignments(job, event_tree_string, event_names_to_sequences, ancestor_event_string, params): # a job should never set its own follow-on, so we hang everything off the root_job here to encapsulate root_job = Job() 
@@ -441,11 +519,12 @@ def make_paf_alignments(job, event_tree_string, event_names_to_sequences, ancest # for each ingroup make alignments to the outgroups if int(params.find("blast").attrib["trimIngroups"]): # Trim the ingroup sequences - outgroup_alignments = [root_job.addChildJobFn(make_ingroup_to_outgroup_alignments_1, ingroup, outgroup_events, + outgroup_alignments = [root_job.addChildJobFn(make_ingroup_to_outgroup_alignments_0, ingroup, outgroup_events, dict(event_names_to_sequences), distances, params).rv() for ingroup in ingroup_events] if len(outgroup_events) > 0 else [] else: outgroup_alignments = [root_job.addChildJobFn(make_chunked_alignments, + # Ingroup will be the target, outgroup the query ingroup.iD, event_names_to_sequences[ingroup.iD], outgroup.iD, event_names_to_sequences[outgroup.iD], distances[ingroup, outgroup], params, @@ -454,10 +533,18 @@ def make_paf_alignments(job, event_tree_string, event_names_to_sequences, ancest # for better logs outgroup_alignment_names = ['{}-og_{}'.format(ancestor_event_string, i) for i in range(len(outgroup_alignments))] - # Now do the chaining - return root_job.addFollowOnJobFn(chain_alignments, ingroup_alignments + outgroup_alignments, - ingroup_alignment_names + outgroup_alignment_names, - ancestor_event_string, params).rv() + # Now do the chaining, either getting ingroup primary alignments separately to primary ingroup to outgroup + # alignments if the following option is true, otherwise get every sequence to pick its primary alignment without + # regard to whether the other sequence is an ingroup or an outgroup + if int(params.find("blast").attrib["pickIngroupPrimaryAlignmentsSeparatelyToOutgroups"]): + return root_job.addFollowOnJobFn(chain_alignments_splitting_ingroups_and_outgroups, + ingroup_alignments, ingroup_alignment_names, + outgroup_alignments, outgroup_alignment_names, + ancestor_event_string, params).rv() + + return job.addFollowOnJobFn(chain_alignments, ingroup_alignments + outgroup_alignments, + 
ingroup_alignment_names + outgroup_alignment_names, + ancestor_event_string, params).rv() def trim_unaligned_sequences(job, sequences, alignments, params): diff --git a/src/cactus/preprocessor/cactus_preprocessor.py b/src/cactus/preprocessor/cactus_preprocessor.py index 82ab661df..0f73bd91f 100644 --- a/src/cactus/preprocessor/cactus_preprocessor.py +++ b/src/cactus/preprocessor/cactus_preprocessor.py @@ -43,11 +43,12 @@ from cactus.preprocessor.dnabrnnMasking import DnabrnnMaskJob, loadDnaBrnnModel from cactus.preprocessor.cutHeaders import CutHeadersJob from cactus.preprocessor.fileMasking import maskJobOverride, FileMaskingJob +from cactus.progressive.cactus_prepare import human2bytesN class PreprocessorOptions: def __init__(self, chunkSize, memory, cpu, check, proportionToSample, unmask, preprocessJob, checkAssemblyHub=None, lastzOptions=None, minPeriod=None, - gpu=0, dnabrnnOpts=None, + gpu=0, lastz_memory=None, dnabrnnOpts=None, dnabrnnAction=None, eventName=None, minLength=None, cutBefore=None, cutBeforeOcc=None, cutAfter=None, inputBedID=None): self.chunkSize = chunkSize @@ -64,6 +65,7 @@ def __init__(self, chunkSize, memory, cpu, check, proportionToSample, unmask, self.gpuLastzInterval = self.chunkSize if self.gpu: self.chunkSize = 0 + self.lastz_memory= lastz_memory self.dnabrnnOpts = dnabrnnOpts self.dnabrnnAction = dnabrnnAction assert dnabrnnAction in ('softmask', 'hardmask', 'clip') @@ -146,6 +148,7 @@ def getChunkedJobForCurrentStage(self, seqIDs, proportionSampled, inChunkID, chu lastzOpts=self.prepOptions.lastzOptions, gpu=self.prepOptions.gpu, cpu=self.prepOptions.cpu, + lastz_memory=self.prepOptions.lastz_memory, gpuLastzInterval=self.prepOptions.gpuLastzInterval, eventName='{}_{}'.format(self.prepOptions.eventName, chunk_i)) return LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions, @@ -256,6 +259,7 @@ def run(self, fileStore): minPeriod = getOptionalAttrib(prepNode, "minPeriod", typeFn=int, default=0), checkAssemblyHub = 
getOptionalAttrib(prepNode, "checkAssemblyHub", typeFn=bool, default=False), gpu = getOptionalAttrib(prepNode, "gpu", typeFn=int, default=0), + lastz_memory = getOptionalAttrib(prepNode, "lastz_memory", typeFn=int, default=None), dnabrnnOpts = getOptionalAttrib(prepNode, "dna-brnnOpts", default=""), dnabrnnAction = getOptionalAttrib(prepNode, "action", typeFn=str, default="softmask"), eventName = getOptionalAttrib(prepNode, "eventName", typeFn=str, default=None), @@ -341,7 +345,7 @@ def run(self, fileStore): def stageWorkflow(outputSequenceDir, configNode, inputSequences, toil, restart=False, outputSequences = [], maskMode=None, maskAction=None, maskFile=None, minLength=None, inputEventNames=None, brnnCores=None, - gpu_override=False): + gpu_override=False, options=None): #Replace any constants if not outputSequences: outputSequences = CactusPreprocessor.getOutputSequenceFiles(inputSequences, outputSequenceDir) @@ -352,7 +356,7 @@ def stageWorkflow(outputSequenceDir, configNode, inputSequences, toil, restart=F loadDnaBrnnModel(toil, configNode, maskAlpha = maskMode == 'brnn') if configNode.find("constants") != None: - ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals() + ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals(options) if maskMode: lastz = maskMode == 'lastz' brnn = maskMode == 'brnn' @@ -406,7 +410,7 @@ def runCactusPreprocessor(outputSequenceDir, configFile, inputSequences, toilDir toilOptions.logLevel = "INFO" toilOptions.disableCaching = True with Toil(toilOptions) as toil: - stageWorkflow(outputSequenceDir, ET.parse(options.configFile).getroot(), inputSequences, toil) + stageWorkflow(outputSequenceDir, ET.parse(options.configFile).getroot(), inputSequences, toil, options=toilOptions) def main(): parser = ArgumentParser() @@ -432,7 +436,11 @@ def main(): parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"], help="The way to run the Cactus binaries", default=None) 
parser.add_argument("--gpu", nargs='?', const='all', default=None, help="toggle on GPU-enabled lastz, and specify number of GPUs (all available if no value provided)") - parser.add_argument("--lastzCores", type=int, default=None, help="Number of cores for each lastz job, only relevant when running with --gpu") + parser.add_argument("--lastzCores", type=int, default=None, help="Number of cores for each lastz/segalign job, only relevant when running with --gpu") + parser.add_argument("--lastzMemory", type=human2bytesN, + help="Memory in bytes for each lastz/segalign job (defaults to an estimate based on the input data size). " + "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None) + parser.add_argument("--pangenome", action="store_true", help='Do not mask. Just add Cactus-style unique prefixes and strip anything up to and including last #') options = parser.parse_args() @@ -552,7 +560,8 @@ def main(): maskAction=options.maskAction, minLength=options.minLength, inputEventNames=inNames, - brnnCores=options.brnnCores) + brnnCores=options.brnnCores, + options=options) if __name__ == '__main__': main() diff --git a/src/cactus/preprocessor/lastzRepeatMasking/cactus_lastzRepeatMask.py b/src/cactus/preprocessor/lastzRepeatMasking/cactus_lastzRepeatMask.py index a8ed28b5d..4d37e5633 100644 --- a/src/cactus/preprocessor/lastzRepeatMasking/cactus_lastzRepeatMask.py +++ b/src/cactus/preprocessor/lastzRepeatMasking/cactus_lastzRepeatMask.py @@ -25,6 +25,7 @@ def __init__(self, proportionSampled=1.0, gpu=0, cpu=None, + lastz_memory=None, gpuLastzInterval=3000000, eventName='seq'): self.fragment = fragment @@ -35,6 +36,7 @@ def __init__(self, self.proportionSampled = proportionSampled self.gpu = gpu self.cpu = cpu + self.lastz_memory = lastz_memory self.gpuLastzInterval = gpuLastzInterval self.eventName = eventName @@ -48,8 +50,13 @@ def __init__(self, class LastzRepeatMaskJob(RoundedJob): def __init__(self, repeatMaskOptions, queryID, 
targetIDs): targetsSize = sum(targetID.size for targetID in targetIDs) - memory = 4*1024*1024*1024 - disk = 4*(queryID.size + targetsSize) + if repeatMaskOptions.lastz_memory: + memory = repeatMaskOptions.lastz_memory + elif repeatMaskOptions.gpu: + memory = min(40 * targetsSize, 512e9) + else: + memory = 4*1024*1024*1024 + disk = max(4*(queryID.size + targetsSize), memory) cores = repeatMaskOptions.cpu accelerators = ['cuda:{}'.format(repeatMaskOptions.gpu)] if repeatMaskOptions.gpu else None RoundedJob.__init__(self, memory=memory, disk=disk, cores=cores, accelerators=accelerators, preemptable=True) diff --git a/src/cactus/progressive/cactus_prepare.py b/src/cactus/progressive/cactus_prepare.py index dae86c615..267b7c569 100644 --- a/src/cactus/progressive/cactus_prepare.py +++ b/src/cactus/progressive/cactus_prepare.py @@ -25,7 +25,7 @@ from cactus.progressive.progressive_decomposition import compute_outgroups, get_subtree, check_branch_lengths from cactus.shared.common import cactusRootPath from cactus.shared.common import enableDumpStack, setupBinaries -from cactus.shared.common import getDockerImage, getDockerRelease +from cactus.shared.common import getDockerImage from cactus.shared.configWrapper import ConfigWrapper from cactus.shared.version import cactus_commit from cactus.shared.common import findRequiredNode @@ -746,7 +746,7 @@ def wdl_task_preprocess(options): s += ' gpuCount: {}\n'.format(options.gpu) s += ' bootDiskSizeGb: 20\n' s += ' nvidiaDriverVersion: \"{}\"\n'.format(options.nvidiaDriver) - s += ' docker: \"{}\"\n'.format(getDockerRelease(gpu=True)) + s += ' docker: \"{}\"\n'.format(getDockerImage(gpu=True)) s += ' zones: \"{}\"\n'.format(options.gpuZone) else: s += ' docker: \"{}\"\n'.format(options.dockerImage) @@ -828,7 +828,7 @@ def wdl_task_blast(options): s += ' gpuCount: {}\n'.format(options.gpu) s += ' bootDiskSizeGb: 20\n' s += ' nvidiaDriverVersion: \"{}\"\n'.format(options.nvidiaDriver) - s += ' docker: 
\"{}\"\n'.format(getDockerRelease(gpu=True)) + s += ' docker: \"{}\"\n'.format(getDockerImage(gpu=True)) s += ' zones: \"{}\"\n'.format(options.gpuZone) else: s += ' docker: \"{}\"\n'.format(options.dockerImage) diff --git a/src/cactus/progressive/cactus_progressive.py b/src/cactus/progressive/cactus_progressive.py index dedfd6f69..0d39dba54 100755 --- a/src/cactus/progressive/cactus_progressive.py +++ b/src/cactus/progressive/cactus_progressive.py @@ -206,7 +206,7 @@ def progressive_step_2(job, trimmed_outgroups_and_alignments, options, config_no def export_hal(job, mc_tree, config_node, seq_id_map, og_map, results, event=None, cacheBytes=None, cacheMDC=None, cacheRDC=None, cacheW0=None, chunk=None, deflate=None, inMemory=False, - checkpointInfo=None, acyclicEvent=None, has_resources=False): + checkpointInfo=None, acyclicEvent=None, has_resources=False, memory_override=None): # todo: going through list nonsense because (i think) it helps with promises, should at least clean up work_dir = job.fileStore.getLocalTempDir() @@ -268,8 +268,8 @@ def export_hal(job, mc_tree, config_node, seq_id_map, og_map, results, event=Non disk = 3 * sum([file_id.size for file_id in fa_file_ids + c2h_file_ids]) mem = 5 * (max([file_id.size for file_id in fa_file_ids]) + max([file_id.size for file_id in c2h_file_ids])) # allows pass-through of memory override from --consMemory - if job.memory: - mem = job.memory + if memory_override: + mem = memory_override return job.addChildJobFn(export_hal, mc_tree, config_node, seq_id_map, og_map, results, event=event, cacheBytes=cacheBytes, cacheMDC=cacheMDC, cacheRDC=cacheRDC, cacheW0=cacheW0, chunk=chunk, deflate=deflate, inMemory=inMemory, checkpointInfo=checkpointInfo, @@ -310,7 +310,7 @@ def progressive_workflow(job, options, config_node, mc_tree, og_map, input_seq_i # then do the hal export hal_export_job = progressive_job.addFollowOnJobFn(export_hal, mc_tree, config_node, seq_id_map, og_map, - progressive_job.rv(), event=root_event, 
memory=options.consMemory) + progressive_job.rv(), event=root_event, memory_override=options.consMemory) return hal_export_job.rv() @@ -340,7 +340,10 @@ def main(): parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"], help="The way to run the Cactus binaries", default=None) parser.add_argument("--gpu", nargs='?', const='all', default=None, help="toggle on GPU-enabled lastz, and specify number of GPUs (all available if no value provided)") - parser.add_argument("--lastzCores", type=int, default=None, help="Number of cores for each lastz job, only relevant when running with --gpu") + parser.add_argument("--lastzCores", type=int, default=None, help="Number of cores for each lastz/segalign job, only relevant when running with --gpu") + parser.add_argument("--lastzMemory", type=human2bytesN, + help="Memory in bytes for each lastz/segalign job (defaults to an estimate based on the input data size). " + "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None) parser.add_argument("--consCores", type=int, help="Number of cores for each cactus_consolidated job (defaults to all available / maxCores on single_machine)", default=None) parser.add_argument("--consMemory", type=human2bytesN, @@ -398,7 +401,7 @@ def main(): # load up the seqfile and figure out the outgroups and schedule config_node = ET.parse(options.configFile).getroot() config_wrapper = ConfigWrapper(config_node) - config_wrapper.substituteAllPredefinedConstantsWithLiterals() + config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) config_wrapper.setSystemMemory(options) # apply gpu override diff --git a/src/cactus/progressive/outgroup.py b/src/cactus/progressive/outgroup.py index 873ab61bc..a222c0a55 100644 --- a/src/cactus/progressive/outgroup.py +++ b/src/cactus/progressive/outgroup.py @@ -253,7 +253,7 @@ def main(): # can't use progressive_decomposition.py for circular deps... 
config_node = ET.parse(options.configFile).getroot() config_wrapper = ConfigWrapper(config_node) - config_wrapper.substituteAllPredefinedConstantsWithLiterals() + config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) seq_file = SeqFile(args[0]) mc_tree = MultiCactusTree(seq_file.tree) mc_tree.nameUnlabeledInternalNodes(config_wrapper.getDefaultInternalNodePrefix()) diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index 68447333e..077ba4884 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -111,7 +111,7 @@ def graph_map(options): paf_id, gfa_fa_id, gaf_id, unfiltered_paf_id, paf_filter_log = toil.restart() else: # load up the seqfile and figure out the outgroups and schedule - config_wrapper.substituteAllPredefinedConstantsWithLiterals() + config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) mc_tree, input_seq_map, og_candidates = parse_seqfile(options.seqFile, config_wrapper, pangenome=True) og_map = compute_outgroups(mc_tree, config_wrapper, set(og_candidates)) event_set = get_event_set(mc_tree, config_wrapper, og_map, mc_tree.getRootName()) @@ -429,7 +429,7 @@ def extract_paf_from_gfa(job, gfa_id, gfa_path, ref_event, graph_event, ignore_p cactus_call(parameters=cmd, outfile=paf_path) return job.fileStore.writeGlobalFile(paf_path) -def filter_paf(job, paf_id, config): +def filter_paf(job, paf_id, config, reference=None): """ run basic paf-filtering. 
these are quick filters that are best to do on-the-fly when reading the paf and as such, they are called by cactus-graphmap-split and cactus-align, not here """ work_dir = job.fileStore.getLocalTempDir() @@ -444,6 +444,7 @@ def filter_paf(job, paf_id, config): with open(paf_path, 'r') as paf_file, open(filter_paf_path, 'w') as filter_paf_file: for line in paf_file: toks = line.split('\t') + is_ref = reference and toks[0].startswith('id={}|'.format(reference)) mapq = int(toks[11]) query_len = int(toks[1]) ident = float(toks[9]) / (float(toks[10]) + 0.00000001) @@ -456,7 +457,7 @@ def filter_paf(job, paf_id, config): # we can also get the identity of the parent gaf block if tok.startswith('gi:i:'): ident = min(ident, float(toks[5:])) - if mapq >= min_mapq and (bl is None or query_len <= min_block or bl >= min_block) and ident >= min_ident: + if is_ref or (mapq >= min_mapq and (bl is None or query_len <= min_block or bl >= min_block) and ident >= min_ident): filter_paf_file.write(line) overlap_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterRatio", typeFn=float, default=0) diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index a42724ce3..8a6b4e78d 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -320,7 +320,7 @@ def graphmap_join(options): #load cactus config configNode = ET.parse(options.configFile).getroot() config = ConfigWrapper(configNode) - config.substituteAllPredefinedConstantsWithLiterals() + config.substituteAllPredefinedConstantsWithLiterals(options) # load up the vgs vg_ids = [] @@ -613,7 +613,7 @@ def clip_vg(job, options, config, vg_path, vg_id, phase): cactus_call(parameters=['vg', 'convert', '-W', '-f', vg_path], outfile=gfa_in_path, job_memory=job.memory) fix_cmd = ['gfaffix', gfa_in_path, '--output_refined', gfa_out_path, '--check_transformation'] if options.reference: - fix_cmd += ['--dont_collapse', 
options.reference[0] + '*'] + fix_cmd += ['--dont_collapse', options.reference[0] + '#[.]*'] cactus_call(parameters=fix_cmd, job_memory=job.memory) # GFAffix strips the header, until this is fixed we need to add it back (for the RS tag) gfa_header = cactus_call(parameters=['head', '-1', gfa_in_path], check_output=True).strip() diff --git a/src/cactus/refmap/cactus_graphmap_split.py b/src/cactus/refmap/cactus_graphmap_split.py index f08f333c6..1ab36bf31 100644 --- a/src/cactus/refmap/cactus_graphmap_split.py +++ b/src/cactus/refmap/cactus_graphmap_split.py @@ -100,7 +100,7 @@ def cactus_graphmap_split(options): #load cactus config config_node = ET.parse(options.configFile).getroot() config = ConfigWrapper(config_node) - config.substituteAllPredefinedConstantsWithLiterals() + config.substituteAllPredefinedConstantsWithLiterals(options) #Run the workflow if options.restart: @@ -225,7 +225,8 @@ def graphmap_split_workflow(job, options, config, seq_id_map, seq_name_map, gfa_ # do some basic paf filtering paf_filter_mem = max(paf_id.size * 10, 2**32) - paf_filter_job = root_job.addFollowOnJobFn(filter_paf, paf_id, config, disk = paf_id.size * 10, memory=paf_filter_mem) + paf_filter_job = root_job.addFollowOnJobFn(filter_paf, paf_id, config, reference=options.reference, + disk = paf_id.size * 10, memory=paf_filter_mem) paf_id = paf_filter_job.rv() root_job = paf_filter_job diff --git a/src/cactus/refmap/cactus_minigraph.py b/src/cactus/refmap/cactus_minigraph.py index 60c8a7011..2039f04e1 100644 --- a/src/cactus/refmap/cactus_minigraph.py +++ b/src/cactus/refmap/cactus_minigraph.py @@ -24,12 +24,14 @@ from cactus.shared.common import cactus_call from cactus.shared.common import getOptionalAttrib, findRequiredNode from cactus.shared.version import cactus_commit +from cactus.progressive.cactus_prepare import human2bytesN from cactus.preprocessor.checkUniqueHeaders import sanitize_fasta_headers from toil.job import Job from toil.common import Toil from toil.statsAndLogging 
import logger from toil.statsAndLogging import set_logging_from_options from toil.realtimeLogger import RealtimeLogger +from toil.lib.conversions import bytes2human from cactus.shared.common import cactus_cpu_count from cactus.progressive.multiCactusTree import MultiCactusTree from sonLib.bioio import getTempDirectory @@ -43,7 +45,10 @@ def main(): parser.add_argument("--reference", required=True, nargs='+', type=str, help = "Reference genome name(s) (added to minigraph first). Mash distance to 1st reference to determine order of other genomes (use minigraphSortInput in the config xml to toggle this behavior).") parser.add_argument("--mgCores", type=int, help = "Number of cores for minigraph construction (defaults to the same as --maxCores).") - + parser.add_argument("--mgMemory", type=human2bytesN, + help="Memory in bytes for the minigraph construction job (defaults to an estimate based on the input data size). " + "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None) + #Progressive Cactus Options parser.add_argument("--configFile", dest="configFile", help="Specify cactus configuration file", @@ -79,7 +84,7 @@ def main(): # load up the seqfile config_node = ET.parse(options.configFile).getroot() config_wrapper = ConfigWrapper(config_node) - config_wrapper.substituteAllPredefinedConstantsWithLiterals() + config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) graph_event = getOptionalAttrib(findRequiredNode(config_node, "graphmap"), "assemblyName", default="_MINIGRAPH_") # load the seqfile @@ -241,6 +246,8 @@ def parse_mash_output(mash_output): catFiles(query_paths, cat_path) cat_mash_output = cactus_call(parameters=['mash', 'dist', cat_path, ref_sketch_path], check_output=True) cat_mash_dist = parse_mash_output(cat_mash_output) + # we want samples to stay together in the event of ties (which happen). 
So add a a little bit to make unique + cat_mash_dist += float(sorted(seq_id_map.keys()).index(query_seqs[0])) * sys.float_info.epsilon # make the individual distance for query_seq, query_path in zip(query_seqs, query_paths): @@ -293,8 +300,11 @@ def minigraph_construct(job, options, config_node, seq_id_map, seq_order, gfa_pa max_size = max([x.size for x in seq_id_map.values()]) total_size = sum([x.size for x in seq_id_map.values()]) disk = total_size * 2 - mem = 128 * max_size + int(total_size / 4) - mem = max(mem, 2**30) + mem = 60 * max_size + int(total_size / 4) + mem = max(mem, 2**31) + if options.mgMemory is not None: + RealtimeLogger.info('Overriding minigraph_construct memory estimate of {} with {} value {} from --mgMemory'.format(bytes2human(mem), 'greater' if options.mgMemory > mem else 'lesser', bytes2human(options.mgMemory))) + mem = options.mgMemory return job.addChildJobFn(minigraph_construct, options, config_node, seq_id_map, seq_order, gfa_path, has_resources=True, disk=disk, memory=mem, cores=options.mgCores).rv() diff --git a/src/cactus/refmap/cactus_pangenome.py b/src/cactus/refmap/cactus_pangenome.py index dbef90d82..4b15b26e5 100644 --- a/src/cactus/refmap/cactus_pangenome.py +++ b/src/cactus/refmap/cactus_pangenome.py @@ -57,6 +57,9 @@ def main(): # cactus-minigraph options parser.add_argument("--mgCores", type=int, help = "Number of cores for minigraph construction (defaults to the same as --maxCores).") + parser.add_argument("--mgMemory", type=human2bytesN, + help="Memory in bytes for the minigraph construction job (defaults to an estimate based on the input data size). " + "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None) # cactus-graphmap options parser.add_argument("--mapCores", type=int, help = "Number of cores for minigraph map. 
Overrides graphmap cpu in configuration") @@ -167,7 +170,7 @@ def main(): # load up the seqfile config_node = ET.parse(options.configFile).getroot() config_wrapper = ConfigWrapper(config_node) - config_wrapper.substituteAllPredefinedConstantsWithLiterals() + config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) graph_event = getOptionalAttrib(findRequiredNode(config_node, "graphmap"), "assemblyName", default="_MINIGRAPH_") # load the seqfile diff --git a/src/cactus/refmap/cactus_refmap.py b/src/cactus/refmap/cactus_refmap.py index 081cda301..f6f451bd3 100644 --- a/src/cactus/refmap/cactus_refmap.py +++ b/src/cactus/refmap/cactus_refmap.py @@ -295,7 +295,7 @@ def main(): # load up the seqfile and figure out the outgroups and schedule config_node = ET.parse(options.configFile).getroot() config_wrapper = ConfigWrapper(config_node) - config_wrapper.substituteAllPredefinedConstantsWithLiterals() + config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) mc_tree, input_seq_map, og_candidates = parse_seqfile(options.seqFile, config_wrapper) og_map = compute_outgroups(mc_tree, config_wrapper, set(og_candidates)) event_set = get_event_set(mc_tree, config_wrapper, og_map, mc_tree.getRootName()) diff --git a/src/cactus/setup/cactus_align.py b/src/cactus/setup/cactus_align.py index 28371a8d8..82a8978fb 100644 --- a/src/cactus/setup/cactus_align.py +++ b/src/cactus/setup/cactus_align.py @@ -248,7 +248,7 @@ def make_align_job(options, toil, config_wrapper=None, chrom_name=None): else: config_node = ET.parse(options.configFile).getroot() config_wrapper = ConfigWrapper(config_node) - config_wrapper.substituteAllPredefinedConstantsWithLiterals() + config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) config_wrapper.initGPU(options) config_wrapper.setSystemMemory(options) @@ -387,12 +387,12 @@ def cactus_align(job, config_wrapper, mc_tree, input_seq_map, input_seq_id_map, # run the hal export hal_job = cons_job.addFollowOnJobFn(export_hal, 
sub_tree, config_wrapper.xmlRoot, new_seq_id_map, og_map, results, event=root_name, inMemory=True, checkpointInfo=checkpointInfo, acyclicEvent=referenceEvents[0] if referenceEvents else None, - memory=cons_memory) + memory_override=cons_memory) # optionally create the VG if doVG or doGFA: vg_export_job = hal_job.addFollowOnJobFn(export_vg, hal_job.rv(), config_wrapper, doVG, doGFA, referenceEvents, - checkpointInfo=checkpointInfo, memory=cons_memory) + checkpointInfo=checkpointInfo, memory_override=cons_memory) vg_file_id, gfa_file_id = vg_export_job.rv(0), vg_export_job.rv(1) else: vg_file_id, gfa_file_id = None, None @@ -400,7 +400,8 @@ def cactus_align(job, config_wrapper, mc_tree, input_seq_map, input_seq_id_map, return hal_job.rv(), vg_file_id, gfa_file_id -def export_vg(job, hal_id, config_wrapper, doVG, doGFA, referenceEvents, checkpointInfo=None, resource_spec = False): +def export_vg(job, hal_id, config_wrapper, doVG, doGFA, referenceEvents, checkpointInfo=None, resource_spec = False, + memory_override=None): """ use hal2vg to convert the HAL to vg format """ if not resource_spec: @@ -410,7 +411,7 @@ def export_vg(job, hal_id, config_wrapper, doVG, doGFA, referenceEvents, checkpo resource_spec = True, disk=hal_id.size * 3, # allow override with cons_memory - memory=hal_id.size * 60 if not job.memory else job.memory).rv() + memory=hal_id.size * 60 if not memory_override else memory_override).rv() work_dir = job.fileStore.getLocalTempDir() hal_path = os.path.join(work_dir, "out.hal") diff --git a/src/cactus/shared/common.py b/src/cactus/shared/common.py index 6afbdcf92..3d650ad1e 100644 --- a/src/cactus/shared/common.py +++ b/src/cactus/shared/common.py @@ -40,7 +40,7 @@ from toil.common import Toil from toil.job import Job from toil.realtimeLogger import RealtimeLogger -from toil.lib.humanize import bytes2human +from toil.lib.humanize import bytes2human, human2bytes from toil.lib.threading import cpu_count from sonLib.bioio import popenCatch from 
sonLib.bioio import getTempDirectory @@ -77,6 +77,26 @@ def cactus_override_toil_options(options): # lead to weird toil errors? # https://github.com/DataBiosphere/toil/issues/4218 options.disableCaching = True + + if 'CACTUS_INSIDE_CONTAINER' in os.environ and str(os.environ['CACTUS_INSIDE_CONTAINER']) == '1': + # some people get confused when trying to use their cluster from inside the cactus + # docker. it doesn't work (without tons of hackery) since slurm isnt in the image + if options.batchSystem.lower() not in ['single_machine', 'singleMachine']: + e = RuntimeError('"--batchSystem {}" is not supported from inside the Cactus Docker container. In order to use this batch system, you must run directly from a Python virutalenv on your head node. You can use --binariesMode docker to execute the various binaries using docker, but the cactus command itself must be run directly on your system where it will have access to your cluster environment (ex: sbatch).'.format(options.batchSystem)) + raise e + if options.binariesMode in ['docker', 'singularity']: + e = RuntimeError('"--binariesMode {}" is not supported from inside the Cactus Docker container, which does not support nested containerization. So if you are running Cactus from inside a Docker or Singularity container, please do not use the --binariesMode option.'.format(options.binariesMode)) + raise e + + if options.binariesMode in ['docker', 'singularity']: + # jobs, even when seemingly not using docker, sometimes disappear on + # slurm when using non-local binaries. to date, the solution has been + # to boost memory. 
this override is to help apply this globally to all + # the little jobs + opt_default_mem = options.defaultMemory if options.defaultMemory else 0 + if isinstance(opt_default_mem, str): + opt_default_mem = human2bytes(opt_default_mem) + options.defaultMemory = int(max(opt_default_mem, 2**32)) if not options.realTimeLogging: # Too much valuable debugging information to pass up @@ -268,20 +288,6 @@ def runGetChunks(sequenceFiles, chunksDir, chunkSize, overlapSize, work_dir=None "--dir", chunksDir] + sequenceFiles) return [chunk for chunk in chunks.split("\n") if chunk != ""] -def pullCactusImage(): - """Ensure that the cactus Docker image is pulled.""" - if os.environ.get('CACTUS_DOCKER_MODE') == "0": - return - if os.environ.get('CACTUS_USE_LOCAL_IMAGE', 0) == "1": - return - image = getDockerImage() - call = ["docker", "pull", image] - process = subprocess.Popen(call, stdout=subprocess.PIPE, - stderr=sys.stderr, bufsize=-1) - output, _ = process.communicate() - if process.returncode != 0: - raise RuntimeError("Command %s failed with output: %s" % (call, output)) - def getDockerOrg(): """Get where we should find the cactus containers.""" if "CACTUS_DOCKER_ORG" in os.environ: @@ -289,27 +295,22 @@ def getDockerOrg(): else: return "quay.io/comparative-genomics-toolkit" -def getDockerTag(): +def getDockerTag(gpu=False): """Get what docker tag we should use for the cactus image - (either forced to be latest or the current cactus commit).""" + Default: latest release (which is all there is on quay these days) + otherwise, either forced to be latest or the current cactus commit.""" if 'CACTUS_DOCKER_TAG' in os.environ: return os.environ['CACTUS_DOCKER_TAG'] elif 'CACTUS_USE_LATEST' in os.environ: return "latest" else: - return cactus_commit + # must be manually kept current with each release + return 'v2.6.13' + ('-gpu' if gpu else '') -def getDockerImage(): +def getDockerImage(gpu=False): """Get fully specified Docker image name.""" return "%s/cactus:%s" % 
(getDockerOrg(), getDockerTag()) -def getDockerRelease(gpu=False): - """Get the most recent docker release.""" - r = "quay.io/comparative-genomics-toolkit/cactus:v2.6.8" - if gpu: - r += "-gpu" - return r - def maxMemUsageOfContainer(containerInfo): """Return the max RSS usage (in bytes) of a container, or None if something failed.""" if containerInfo['id'] is None: @@ -435,13 +436,14 @@ def importSingularityImage(options): oldCWD = os.getcwd() os.chdir(os.path.dirname(imgPath)) # --size is deprecated starting in 2.4, but is needed for 2.3 support. Keeping it in for now. + gpu = bool(options.gpu) if 'gpu' in options else False try: subprocess.check_call(["singularity", "pull", "--size", "2000", "--name", os.path.basename(imgPath), - "docker://" + getDockerImage()], stderr=subprocess.PIPE) + "docker://" + getDockerImage(gpu=gpu)], stderr=subprocess.PIPE) except subprocess.CalledProcessError: # Call failed, try without --size, required for singularity 3+ subprocess.check_call(["singularity", "pull", "--name", os.path.basename(imgPath), - "docker://" + getDockerImage()]) + "docker://" + getDockerImage(gpu=gpu)]) os.chdir(oldCWD) else: logger.info("Using pre-built singularity image: '{}'".format(imgPath)) @@ -498,7 +500,7 @@ def singularityCommand(tool=None, # hack to transform back to docker image if tool == 'cactus': - tool = getDockerImage() + tool = getDockerImage(gpu = bool(gpus)) # not a url or local file? 
try it as a Docker specifier if not tool.startswith('/') and '://' not in tool: tool = 'docker://' + tool @@ -600,7 +602,7 @@ def dockerCommand(tool=None, if rm: base_docker_call.append('--rm') - docker_tag = getDockerTag() + docker_tag = getDockerTag(gpu=bool(gpus)) tool = "%s/%s:%s" % (dockstore, tool, docker_tag) call = base_docker_call + [tool] + parameters return call, containerInfo diff --git a/src/cactus/shared/configWrapper.py b/src/cactus/shared/configWrapper.py index b40299f39..b902e1dcc 100644 --- a/src/cactus/shared/configWrapper.py +++ b/src/cactus/shared/configWrapper.py @@ -150,9 +150,17 @@ def getDefaultMemory(self): constantsElem = self.xmlRoot.find("constants") return int(constantsElem.attrib["defaultMemory"]) - def substituteAllPredefinedConstantsWithLiterals(self): + def substituteAllPredefinedConstantsWithLiterals(self, options): constants = findRequiredNode(self.xmlRoot, "constants") defines = constants.find("defines") + if options.binariesMode in ['docker', 'singularity']: + # hack to boost default memory to 4 Gigs when using docker + for attrib_name in defines.attrib: + if attrib_name.endswith('Memory'): + mem_val = int(defines.attrib[attrib_name]) + assert not options.defaultMemory or isinstance(options.defaultMemory, int) + mem_val = max(mem_val, options.defaultMemory if options.defaultMemory else 2**32) + defines.attrib[attrib_name] = str(mem_val) def replaceAllConstants(node, defines): for attrib in node.attrib: if node.attrib[attrib] in defines.attrib: @@ -232,7 +240,9 @@ def initGPU(self, options): findRequiredNode(self.xmlRoot, "blast").attrib["gpu"] = str(options.gpu) for node in self.xmlRoot.findall("preprocessor"): if getOptionalAttrib(node, "preprocessJob") == "lastzRepeatMask": - node.attrib["gpu"] = str(options.gpu) + node.attrib["gpu"] = str(options.gpu) + if 'latest' in options and options.latest: + raise RuntimeError('--latest cannot be used with --gpu') # we need to make sure that gpu is set to an integer (replacing 'all' 
with a count) def get_gpu_count(): @@ -272,13 +282,16 @@ def get_gpu_count(): raise RuntimeError('Invalid value for repeatmask gpu count, {}. Please specify a numeric value with --gpu'.format(lastz_gpu)) node.attrib["gpu"] = str(pp_gpu) - # segalign still can't contorl the number of cores it uses (!). So we give all available on - # single machine. + if 'lastzCores' not in options: + options.lastzCores = None + if 'lastzMemory' not in options: + options.lastzMemory = None + lastz_cores = options.lastzCores if options.lastzCores else None + if getOptionalAttrib(findRequiredNode(self.xmlRoot, "blast"), 'gpu', typeFn=int, default=0): - if options.lastzCores: - lastz_cores = options.lastzCores - else: - # single machine: we give all the cores to segalign + if not lastz_cores: + # segalign still can't contorl the number of cores it uses (!). So we give all available on + # single machine. if options.batchSystem.lower() in ['single_machine', 'singlemachine']: if options.maxCores is not None: lastz_cores = options.maxCores @@ -286,10 +299,21 @@ def get_gpu_count(): lastz_cores = cactus_cpu_count() else: raise RuntimeError('--lastzCores must be used with --gpu on non-singlemachine batch systems') + + # override blast cores and memory if specified + if lastz_cores: findRequiredNode(self.xmlRoot, "blast").attrib["cpu"] = str(lastz_cores) - for node in self.xmlRoot.findall("preprocessor"): - if getOptionalAttrib(node, "preprocessJob") == "lastzRepeatMask": - node.attrib["cpu"] = str(lastz_cores) + if options.lastzMemory: + findRequiredNode(self.xmlRoot, "blast").attrib["lastz_memory"] = str(options.lastzMemory) + + # override preprocess-repeatmask cores and memory if specified + for node in self.xmlRoot.findall("preprocessor"): + if getOptionalAttrib(node, "preprocessJob") == "lastzRepeatMask": + if lastz_cores: + node.attrib["cpu"] = str(lastz_cores) + if options.lastzMemory: + # there is already a general "memory" option, but we need something lastz-specific + 
node.attrib["lastz_memory"] = str(options.lastzMemory) # make absolutely sure realign is never turned on with the gpu. they don't work together because # realign requires small chunks, and segalign needs big chunks diff --git a/submodules/hal b/submodules/hal index afe2d3267..f2068109d 160000 --- a/submodules/hal +++ b/submodules/hal @@ -1 +1 @@ -Subproject commit afe2d3267a64e1ebe7bdbbbb6ed75e273c1f5083 +Subproject commit f2068109dd4b95fa50516f52aa1627686a346d31 diff --git a/test/evolverTest.py b/test/evolverTest.py index 631345169..83ffe90c4 100644 --- a/test/evolverTest.py +++ b/test/evolverTest.py @@ -12,7 +12,7 @@ def setUp(self): self.tempDir = getTempDirectory(os.getcwd()) unittest.TestCase.setUp(self) self.cromwell = False - + def tearDown(self): unittest.TestCase.tearDown(self) if self.cromwell: @@ -55,7 +55,7 @@ def _update_evolver_add_to_node(self, binariesMode, configFile = None, new_leaf= children = subprocess.check_output(['halStats', hal_path, '--children', new_root]).strip().split() children = [c.decode("utf-8") for c in children] genomes = children + [new_root] - + # make the fasta files fa_paths = [os.path.join(self.tempDir, '{}.fa'.format(genome)) for genome in genomes] for genome, genome_path in zip(genomes, fa_paths): @@ -69,7 +69,7 @@ def _update_evolver_add_to_node(self, binariesMode, configFile = None, new_leaf= assert new_child_line # make the seqfile # todo: we could fish branch lengths out of the tree, but it's not really relevant to the test - tree_string = '({}){};'.format(','.join(children + [new_leaf]), new_root) + tree_string = '({}){};'.format(','.join(children + [new_leaf]), new_root) seq_file_path = os.path.join(self.tempDir, 'seqfile.{}'.format(new_root)) with open(seq_file_path, 'w') as seq_file: seq_file.write(tree_string + '\n') @@ -90,7 +90,7 @@ def _update_evolver_add_to_node(self, binariesMode, configFile = None, new_leaf= cmd += ['--latest'] sys.stderr.write('Running {}\n'.format(' '.format(cmd))) 
subprocess.check_call(cmd) - + cmd = ['cactus-align', self._job_store(binariesMode), seq_file_path, out_paf, out_hal, '--includeRoot', '--root', new_root, '--binariesMode', binariesMode, '--logInfo', '--realTimeLogging', '--workDir', self.tempDir] if configFile: @@ -101,7 +101,7 @@ def _update_evolver_add_to_node(self, binariesMode, configFile = None, new_leaf= sys.stderr.write('Running {}\n'.format(' '.format(cmd))) subprocess.check_call(cmd) else: - cmd = ['cactus', self._job_store(binariesMode), seq_file_path, out_hal, + cmd = ['cactus', self._job_store(binariesMode), seq_file_path, out_hal, '--binariesMode', binariesMode, '--logInfo', '--realTimeLogging', '--workDir', self.tempDir] if configFile: cmd += ['--configFile', configFile] @@ -139,29 +139,29 @@ def genome_path(genome): bottom_seq_file.write('{}\t{}\n'.format(genome, genome_path(genome))) # make the bottom hal - bottom_hal_path = os.path.join(self.tempDir, 'bottom.hal') + bottom_hal_path = os.path.join(self.tempDir, 'bottom.hal') subprocess.check_call(['cactus', self._job_store(binariesMode), bottom_seq_path, bottom_hal_path, '--root', 'AncGorilla', '--binariesMode', binariesMode, '--logInfo', '--realTimeLogging', '--workDir', self.tempDir]) # dump the new ancestor with open(genome_path('AncGorilla'), 'w') as anc_file: subprocess.check_call(['hal2fasta', bottom_hal_path, 'AncGorilla'], stdout=anc_file) - + # make the top seq_ile top_seq_path = os.path.join(self.tempDir, 'top.txt') with open(top_seq_path, 'w') as top_seq_file: top_seq_file.write('((simHuman_chr6:0.144018,AncGorilla:0.1)Anc1:0.020593,(simCow_chr6:0.18908,simDog_chr6:0.16303)Anc2:0.032898)Anc0;\n') for genome in ['simCow_chr6', 'simDog_chr6', 'simHuman_chr6', 'AncGorilla', 'Anc0', 'Anc1', 'Anc2']: top_seq_file.write('{}\t{}\n'.format(genome, genome_path(genome))) - + # make the top hal top_hal_path = os.path.join(self.tempDir, 'top.hal') subprocess.check_call(['cactus', self._job_store(binariesMode), top_seq_path, top_hal_path, '--root', 
'Anc1', '--binariesMode', binariesMode, '--logInfo', '--realTimeLogging', '--workDir', self.tempDir]) - + # update the hal subprocess.check_call(['halAddToBranch', hal_path, bottom_hal_path, top_hal_path, 'Anc1', 'AncGorilla', 'mr', 'simGorilla', '0.1', '0.2']) - + def _run_evolver_in_docker(self, seqFile = './examples/evolverMammals.txt'): out_hal = self._out_hal('in_docker') @@ -170,11 +170,11 @@ def _run_evolver_in_docker(self, seqFile = './examples/evolverMammals.txt'): 'cactus /data/js /workdir/{} /data/{}'.format(seqFile, os.path.basename(out_hal))] sys.stderr.write('Running {}\n'.format(' '.format(cmd))) subprocess.check_call(' '.join(cmd), shell=True) - + def _write_primates_seqfile(self, seq_file_path): """ create the primates seqfile at given path""" with open(seq_file_path, 'w') as seq_file: - seq_file.write('simHuman\thttps://raw.githubusercontent.com/UCSantaCruzComputationalGenomicsLab/cactusTestData/master/evolver/primates/loci1/simHuman.chr6\n') + seq_file.write('simHuman\thttps://raw.githubusercontent.com/UCSantaCruzComputationalGenomicsLab/cactusTestData/master/evolver/primates/loci1/simHuman.chr6\n') seq_file.write('simChimp\thttps://raw.githubusercontent.com/UCSantaCruzComputationalGenomicsLab/cactusTestData/master/evolver/primates/loci1/simChimp.chr6\n') seq_file.write('simGorilla\thttps://raw.githubusercontent.com/UCSantaCruzComputationalGenomicsLab/cactusTestData/master/evolver/primates/loci1/simGorilla.chr6\n') seq_file.write('simOrang\thttps://raw.githubusercontent.com/UCSantaCruzComputationalGenomicsLab/cactusTestData/master/evolver/primates/loci1/simOrang.chr6\n') @@ -220,7 +220,7 @@ def _run_evolver_decomposed_wdl(self, name): # hack to use docker to remove filetree in teardown, as cromwell # can leave root files hanging around there self.cromwell = True - + out_seqfile = os.path.join(self.tempDir, 'evolverMammalsOut.txt') in_seqfile = './examples/evolverMammals.txt' out_wdl = os.path.join(self.tempDir, 'prepared.wdl') @@ -310,13 +310,13 
@@ def _run_evolver_primates_refmap(self, binariesMode): # todo: it'd be nice to have an interface for setting tag to something not latest or commit if binariesMode == 'docker': cactus_opts += ['--latest'] - + subprocess.check_call(['cactus-refmap', self._job_store(binariesMode), seq_file_path, "simHuman", cigar_path] + cactus_opts) # do the alignment subprocess.check_call(['cactus-align', self._job_store(binariesMode), seq_file_path, cigar_path, self._out_hal(binariesMode), - '--pangenome'] + cactus_opts) - + '--pangenome'] + cactus_opts) + def _run_evolver_primates_graphmap(self, binariesMode): """ primates star test but using graphmap pangenome pipeline @@ -328,7 +328,7 @@ def _run_evolver_primates_graphmap(self, binariesMode): # use the same logic cactus does to get default config config_path = 'src/cactus/cactus_progressive_config.xml' - + xml_root = ET.parse(config_path).getroot() bar_elem = xml_root.find("bar") bar_elem.attrib["partialOrderAlignment"] = "1" @@ -345,7 +345,7 @@ def _run_evolver_primates_graphmap(self, binariesMode): subprocess.check_call(['cactus-prepare', seq_file_path, '--outDir', os.path.dirname(out_seq_file_path)]) cactus_opts = ['--binariesMode', binariesMode, '--logInfo', '--realTimeLogging', '--workDir', self.tempDir, '--configFile', poa_config_path] - + # preprocess with dna-brnn subprocess.check_call(['cactus-preprocess', self._job_store(binariesMode), seq_file_path, out_seq_file_path, '--maskMode', 'brnn'] + cactus_opts) @@ -368,7 +368,7 @@ def _run_evolver_primates_graphmap(self, binariesMode): # todo: it'd be nice to have an interface for setting tag to something not latest or commit if binariesMode == 'docker': cactus_opts += ['--latest'] - + subprocess.check_call(['cactus-graphmap', self._job_store(binariesMode), out_seq_file_path, mg_path, paf_path, '--outputFasta', fa_path, '--mapCores', '4'] + cactus_opts) @@ -404,7 +404,7 @@ def _run_yeast_pangenome_step_by_step(self, binariesMode): # make a config that replaces stuff like 
SC4.chrI with id=SC4|chrI # also run dna brnn with softmasking orig_seq_file_path = './examples/yeastPangenome.txt' - seq_file_path = os.path.join(self.tempDir, 'pp', os.path.basename(orig_seq_file_path)) + seq_file_path = os.path.join(self.tempDir, 'pp', os.path.basename(orig_seq_file_path)) subprocess.check_call(['cactus-prepare', orig_seq_file_path, '--outDir', os.path.join(self.tempDir, 'pp'), '--seqFileOnly']) cactus_opts = ['--binariesMode', binariesMode, '--logInfo', '--realTimeLogging', '--workDir', self.tempDir, '--maxCores', '4'] subprocess.check_call(['cactus-preprocess', self._job_store(binariesMode), @@ -417,18 +417,18 @@ def _run_yeast_pangenome_step_by_step(self, binariesMode): toks = line.strip().split() if len(toks) == 2: events.append(toks[0]) - + # build the minigraph mg_path = os.path.join(self.tempDir, 'yeast.sv.gfa.gz') mg_cmd = ['cactus-minigraph', self._job_store(binariesMode), seq_file_path, mg_path, '--reference', 'S288C'] + cactus_opts subprocess.check_call(mg_cmd) - + # run graphmap in base mode paf_path = os.path.join(self.tempDir, 'yeast.paf') - fa_path = os.path.join(self.tempDir, 'yeast.sv.gfa.fa') + fa_path = os.path.join(self.tempDir, 'yeast.sv.gfa.fa') subprocess.check_call(['cactus-graphmap', self._job_store(binariesMode), seq_file_path, mg_path, paf_path, '--outputFasta', fa_path, '--delFilter', '2000000'] + cactus_opts + ['--mapCores', '1']) - + # split into chromosomes chroms = ['chrI', 'chrII', 'chrIII', 'chrIV', 'chrV', 'chrVI', 'chrVII', 'chrVIII', 'chrIX', 'chrX', 'chrXI', 'chrXIV', 'chrXV'] gm_out_dir = os.path.join(self.tempDir, 'chroms') @@ -451,7 +451,7 @@ def _run_yeast_pangenome_step_by_step(self, binariesMode): # join up the graphs and index for giraffe join_path = os.path.join(self.tempDir, 'join') subprocess.check_call(['cactus-graphmap-join', self._job_store(binariesMode), '--outDir', join_path, '--outName', 'yeast', - '--reference', 'S288C', '--vg'] + vg_files + ['--hal'] + hal_files + + '--reference', 'S288C', 
'--vg'] + vg_files + ['--hal'] + hal_files + ['--vcf', '--giraffe', 'clip', 'filter'] + cactus_opts + ['--indexCores', '4']) def _run_yeast_pangenome(self, binariesMode): @@ -466,13 +466,13 @@ def _run_yeast_pangenome(self, binariesMode): chroms = ['chrI', 'chrII', 'chrIII', 'chrIV', 'chrV', 'chrVI', 'chrVII', 'chrVIII', 'chrIX', 'chrX', 'chrXI', 'chrXIV', 'chrXV'] cactus_pangenome_cmd = ['cactus-pangenome', self._job_store(binariesMode), orig_seq_file_path, '--outDir', join_path, '--outName', 'yeast', - '--refContigs'] + chroms + ['--reference', 'S288C', 'DBVPG6044', '--vcf', '--vcfReference','DBVPG6044', 'S288C', + '--refContigs'] + chroms + ['--reference', 'S288C', 'DBVPG6044', '--vcf', '--vcfReference','DBVPG6044', 'S288C', '--giraffe', 'clip', 'filter', '--chrom-vg', 'clip', 'filter', '--viz', '--chrom-og', 'clip', 'full', '--odgi', '--haplo', 'clip', '--rgfa', '--indexCores', '4', '--consCores', '2'] subprocess.check_call(cactus_pangenome_cmd + cactus_opts) - #compatibility with older test + #compatibility with older test subprocess.check_call(['mkdir', '-p', os.path.join(self.tempDir, 'chroms')]) subprocess.check_call(['mv', os.path.join(join_path, 'chrom-subproblems', 'contig_sizes.tsv'), os.path.join(self.tempDir, 'chroms')]) @@ -535,7 +535,7 @@ def _check_yeast_pangenome(self, binariesMode, other_ref=None, expect_odgi=False for giraffe_idx in ['yeast.gbz', 'yeast.dist', 'yeast.min', 'yeast.d2.gbz', 'yeast.d2.dist', 'yeast.d2.min']: idx_bytes = os.path.getsize(os.path.join(join_path, giraffe_idx)) self.assertGreaterEqual(idx_bytes, 500000) - + if expect_haplo: # make sure we have the haplo indexes: for haplo_idx in ['yeast.ri', 'yeast.hapl']: @@ -576,13 +576,13 @@ def _check_yeast_pangenome(self, binariesMode, other_ref=None, expect_odgi=False self.assertEqual(len(sizes), 10) self.assertGreaterEqual(int(sizes[9]), 200000) - # make sure the gbz stats are sane + # make sure the gbz stats are sane clip_degree_dist = subprocess.check_output(['vg', 'stats', 
'-D', os.path.join(join_path, 'yeast.gbz')]).strip().decode('utf-8') - clip_degree_dist = len(clip_degree_dist.strip().split('\n')) + clip_degree_dist = len(clip_degree_dist.strip().split('\n')) self.assertGreaterEqual(clip_degree_dist, 6) filter_degree_dist = subprocess.check_output(['vg', 'stats', '-D', os.path.join(join_path, 'yeast.d2.gbz')]).strip().decode('utf-8') - filter_degree_dist = len(filter_degree_dist.strip().split('\n')) + filter_degree_dist = len(filter_degree_dist.strip().split('\n')) self.assertGreaterEqual(filter_degree_dist, 6) self.assertLess(filter_degree_dist, clip_degree_dist) @@ -615,8 +615,8 @@ def _check_yeast_pangenome(self, binariesMode, other_ref=None, expect_odgi=False self.assertGreaterEqual(os.path.getsize(os.path.join(join_path, 'yeast.viz', chr + '.full.viz.png')), 700) self.assertGreaterEqual(os.path.getsize(os.path.join(join_path, 'yeast.chroms', chr + '.full.og')), 3000000) self.assertGreaterEqual(os.path.getsize(os.path.join(join_path, 'yeast.chroms', chr + '.og')), 3000000) - - + + def _csvstr_to_table(self, csvstr, header_fields): """ Hacky csv parse """ output_stats = {} @@ -736,7 +736,7 @@ def _check_maf_accuracy(self, halPath, delta, dataset="mammals"): # run it with --dupeMode consensus just to make sure it doesn't crash subprocess.check_call(['cactus-hal2maf', self._job_store('h2m'), halPath, halPath + '.consensus.maf.gz', '--chunkSize', '10000', '--batchCount', '2', '--refGenome', 'Anc0', '--dupeMode', 'consensus'], shell=False) - + # cactus-hal2maf doesnt support --onlySequenceNames because the genome names are needed by taf_add_gap_bases # so we manually filter here for genome in subprocess.check_output(['halStats', halPath, '--genomes']).strip().decode('utf-8').split(): @@ -747,7 +747,7 @@ def _check_maf_accuracy(self, halPath, delta, dataset="mammals"): def parse_mafcomp_output(xml_path): xml_root = ET.parse(xml_path).getroot() comp_roots = xml_root.findall("homologyTests") - assert len(comp_roots) == 2 + assert 
len(comp_roots) == 2 first, second = None, None for comp_root in comp_roots: avg_val = float(comp_root.find("aggregateResults").find("all").attrib["average"]) @@ -773,7 +773,7 @@ def _check_valid_hal(self, halPath, expected_tree=None): hal_tree = subprocess.check_output(['halStats', halPath, '--tree']).strip().decode('utf-8') if expected_tree: self.assertEqual(hal_tree, expected_tree) - + def testEvolverLocal(self): """ Check that the output of halStats on a hal file produced by running cactus with --binariesMode local is is reasonable @@ -785,20 +785,20 @@ def testEvolverLocal(self): # check the output #self._check_stats(self._out_hal(name), delta_pct=0.25) #self._check_coverage(self._out_hal(name), delta_pct=0.20) - self._check_maf_accuracy(self._out_hal(name), delta=(0.05,0.14)) + self._check_maf_accuracy(self._out_hal(name), delta=(0.05,0.13)) def testEvolverUpdateNodeLocal(self): """ Check that the "add a genome to ancestor" modification steps give sane/valid results """ name = "local" self._run_evolver(name) - + self._update_evolver_add_to_node(name, new_leaf='simChimp', new_root='Anc1', step_by_step = True) self._update_evolver_add_to_node(name, new_leaf='simOrang', new_root='mr', step_by_step = False) self._check_valid_hal(self._out_hal(name), expected_tree='((simHuman_chr6:0.144018,(simMouse_chr6:0.084509,simRat_chr6:0.091589,simOrang:1)mr:0.271974,simChimp:1)Anc1:0.020593,(simCow_chr6:0.18908,simDog_chr6:0.16303)Anc2:0.032898)Anc0;') - self._check_maf_accuracy(self._out_hal(name), delta=(0.1,0.14)) + self._check_maf_accuracy(self._out_hal(name), delta=(0.1,0.13)) def testEvolverUpdateBranchLocal(self): """ Check that the "add a genome to branch" modification steps give sane/valid results """ @@ -809,7 +809,7 @@ def testEvolverUpdateBranchLocal(self): self._check_valid_hal(self._out_hal(name), 
expected_tree='((simHuman_chr6:0.144018,((simMouse_chr6:0.084509,simRat_chr6:0.091589)mr:0.171974,simGorilla:0.2)AncGorilla:0.1)Anc1:0.020593,(simCow_chr6:0.18908,simDog_chr6:0.16303)Anc2:0.032898)Anc0;') - self._check_maf_accuracy(self._out_hal(name), delta=(0.1,0.14)) + self._check_maf_accuracy(self._out_hal(name), delta=(0.1,0.13)) def testEvolverPrepareWDL(self): @@ -820,7 +820,7 @@ def testEvolverPrepareWDL(self): # check the output #self._check_stats(self._out_hal("wdl"), delta_pct=0.25) #self._check_coverage(self._out_hal("wdl"), delta_pct=0.20) - self._check_maf_accuracy(self._out_hal("wdl"), delta=(0.05,0.14)) + self._check_maf_accuracy(self._out_hal("wdl"), delta=(0.05,0.13)) def testEvolverPrepareToil(self): @@ -831,7 +831,7 @@ def testEvolverPrepareToil(self): # check the output #self._check_stats(self._out_hal(name), delta_pct=0.25) #self._check_coverage(self._out_hal(name), delta_pct=0.20) - self._check_maf_accuracy(self._out_hal(name), delta=(0.05,0.14)) + self._check_maf_accuracy(self._out_hal(name), delta=(0.05,0.13)) def testEvolverDecomposedLocal(self): """ Check that the output of halStats on a hal file produced by running cactus with --binariesMode local is @@ -844,7 +844,7 @@ def testEvolverDecomposedLocal(self): # check the output #self._check_stats(self._out_hal(name), delta_pct=0.25) #self._check_coverage(self._out_hal(name), delta_pct=0.20) - self._check_maf_accuracy(self._out_hal(name), delta=(0.05,0.14)) + self._check_maf_accuracy(self._out_hal(name), delta=(0.05,0.13)) def testEvolverDecomposedDocker(self): """ Check that the output of halStats on a hal file produced by running cactus with --binariesMode docker is @@ -857,8 +857,8 @@ def testEvolverDecomposedDocker(self): # check the output #self._check_stats(self._out_hal(name), delta_pct=0.25) #self._check_coverage(self._out_hal(name), delta_pct=0.20) - self._check_maf_accuracy(self._out_hal(name), delta=(0.05,0.14)) - + self._check_maf_accuracy(self._out_hal(name), delta=(0.05,0.13)) 
+ def testEvolverDocker(self): """ Check that the output of halStats on a hal file produced by running cactus with --binariesMode docker is is reasonable. Note: the local image being tested should be set up via CACTUS_DOCKER_ORG (with tag==latest) @@ -869,7 +869,7 @@ def testEvolverDocker(self): # check the output #self._check_stats(self._out_hal("docker"), delta_pct=0.25) #self._check_coverage(self._out_hal("docker"), delta_pct=0.20) - self._check_maf_accuracy(self._out_hal("docker"), delta=(0.05,0.14)) + self._check_maf_accuracy(self._out_hal("docker"), delta=(0.05,0.13)) def testEvolverInDocker(self): """ Check that the output of halStats on a hal file produced by running cactus in docker @@ -881,8 +881,8 @@ def testEvolverInDocker(self): # check the output #self._check_stats(self._out_hal("docker"), delta_pct=0.25) #self._check_coverage(self._out_hal("docker"), delta_pct=0.20) - self._check_maf_accuracy(self._out_hal("in_docker"), delta=(0.05,0.14)) - + self._check_maf_accuracy(self._out_hal("in_docker"), delta=(0.05,0.13)) + def testEvolverPrepareNoOutgroupDocker(self): # run cactus step by step via the plan made by cactus-prepare @@ -907,7 +907,7 @@ def testEvolverPOALocal(self): """ # use the same logic cactus does to get default config config_path = 'src/cactus/cactus_progressive_config.xml' - + xml_root = ET.parse(config_path).getroot() bar_elem = xml_root.find("bar") bar_elem.attrib["partialOrderAlignment"] = "1" @@ -953,13 +953,13 @@ def testEvolverPrimatesPangenomeLocal(self): # check the output # todo: tune config so that delta can be reduced - self._check_maf_accuracy(self._out_hal("local"), delta=(0.025,0.025), dataset='primates') - + self._check_maf_accuracy(self._out_hal("local"), delta=(0.025,0.025), dataset='primates') + def testYeastPangenomeStepByStepLocal(self): """ Run pangenome pipeline (including contig splitting!) 
on yeast dataset step by step""" name = "local" self._run_yeast_pangenome_step_by_step(name) - + # check the output self._check_yeast_pangenome(name) @@ -967,7 +967,7 @@ def testYeastPangenomeLocal(self): """ Run pangenome pipeline (including contig splitting!) on yeast dataset using cactus-pangenome """ name = "local" self._run_yeast_pangenome(name) - + # check the output self._check_yeast_pangenome(name, other_ref='DBVPG6044', expect_odgi=True, expect_haplo=True, expect_rgfa=True)