From 3a022640119dd2fa65cf23c30ca86f8d7999bd3a Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 7 Sep 2023 20:38:46 -0700 Subject: [PATCH] GBWTGraph to GFA without translation --- deps/gbwtgraph | 2 +- src/subcommand/convert_main.cpp | 31 ++++++++++++++++++++++++------- test/t/48_vg_convert.t | 12 +++++++++++- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/deps/gbwtgraph b/deps/gbwtgraph index eff446aeb2d..5e80b0a0c14 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit eff446aeb2da0fed1ae1bd2ef01e577354c27fca +Subproject commit 5e80b0a0c14aa2e7b1402d982bba7e074b5da95a diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp index 710bdb4458a..6b623376901 100644 --- a/src/subcommand/convert_main.cpp +++ b/src/subcommand/convert_main.cpp @@ -64,7 +64,10 @@ int main_convert(int argc, char** argv) { bool rgfa_pline = false; bool wline = true; algorithm_type gfa_output_algorithm = ALGORITHM_DEFAULT; - int num_threads = omp_get_max_threads(); // For GBWTGraph to GFA. + + // For GBWTGraph to GFA. 
+ int num_threads = omp_get_max_threads(); + bool use_translation = true; if (argc == 2) { help_convert(argv); @@ -74,6 +77,7 @@ int main_convert(int argc, char** argv) { constexpr int OPT_REF_SAMPLE = 1000; constexpr int OPT_GBWTGRAPH_ALGORITHM = 1001; constexpr int OPT_VG_ALGORITHM = 1002; + constexpr int OPT_NO_TRANSLATION = 1003; int c; optind = 2; // force optind past command positional argument @@ -98,6 +102,7 @@ int main_convert(int argc, char** argv) { {"no-wline", no_argument, 0, 'W'}, {"gbwtgraph-algorithm", no_argument, 0, OPT_GBWTGRAPH_ALGORITHM}, {"vg-algorithm", no_argument, 0, OPT_VG_ALGORITHM}, + {"no-translation", no_argument, 0, OPT_NO_TRANSLATION}, {"gam-to-gaf", required_argument, 0, 'G'}, {"gaf-to-gam", required_argument, 0, 'F'}, {"threads", required_argument, 0, 't'}, @@ -173,6 +178,9 @@ int main_convert(int argc, char** argv) { case OPT_VG_ALGORITHM: gfa_output_algorithm = algorithm_vg; break; + case OPT_NO_TRANSLATION: + use_translation = false; + break; case 'G': no_multiple_inputs(input); input = input_gam; @@ -411,6 +419,7 @@ int main_convert(int argc, char** argv) { gbwtgraph::GFAExtractionParameters parameters; parameters.num_threads = num_threads; + parameters.use_translation = use_translation; gbwtgraph::gbwt_to_gfa(*gbwt_graph, std::cout, parameters); } else if (gfa_output_algorithm == algorithm_vg) { // Use HandleGraph GFA conversion code @@ -468,14 +477,22 @@ void help_convert(char** argv) { << " -p, --packed-out output in PackedGraph format [default]" << endl << " -x, --xg-out output in XG format" << endl << " -f, --gfa-out output in GFA format" << endl - << " -H, --drop-haplotypes do not include haplotype paths in the output (useful with GBWTGraph / GBZ inputs)" << endl + << " -H, --drop-haplotypes do not include haplotype paths in the output" << endl + << " (useful with GBWTGraph / GBZ inputs)" << endl << "gfa output options (use with -f):" << endl - << " -P, --rgfa-path STR write given path as rGFA tags instead of lines 
(multiple allowed, only rank-0 supported)" << endl - << " -Q, --rgfa-prefix STR write paths with given prefix as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl + << " -P, --rgfa-path STR write given path as rGFA tags instead of lines" << endl + << " (multiple allowed, only rank-0 supported)" << endl + << " -Q, --rgfa-prefix STR write paths with given prefix as rGFA tags instead of lines" << endl + << " (multiple allowed, only rank-0 supported)" << endl << " -B, --rgfa-pline paths written as rGFA tags also written as lines" << endl - << " -W, --no-wline write all paths as GFA P-lines instead of W-lines. Allows handling multiple phase blocks and subranges used together." << endl - << " --gbwtgraph-algorithm Always use the GBWTGraph library GFA algorithm. Not compatible with other GBWT output options or non-GBWT graphs." << endl - << " --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types, but can't preserve original GFA coordinates." << endl + << " -W, --no-wline Write all paths as GFA P-lines instead of W-lines." << endl + << " Allows handling multiple phase blocks and subranges used together." << endl + << " --gbwtgraph-algorithm Always use the GBWTGraph library GFA algorithm." << endl + << " Not compatible with other GFA output options or non-GBWT graphs." << endl + << " --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types," << endl + << " but can't preserve original GFA coordinates." << endl + << " --no-translation When using the GBWTGraph algorithm, convert the graph directly to GFA." << endl + << " Do not use the translation to preserve original coordinates." 
<< endl << "alignment options:" << endl << " -G, --gam-to-gaf FILE convert GAM FILE to GAF" << endl << " -F, --gaf-to-gam FILE convert GAF FILE to GAM" << endl diff --git a/test/t/48_vg_convert.t b/test/t/48_vg_convert.t index 04fbb81803d..44e9e245409 100644 --- a/test/t/48_vg_convert.t +++ b/test/t/48_vg_convert.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 102 +plan tests 106 vg construct -r complex/c.fa -v complex/c.vcf.gz > c.vg cat <(vg view c.vg | grep ^S | sort) <(vg view c.vg | grep L | uniq | wc -l) <(vg paths -v c.vg -E) > c.info @@ -361,6 +361,15 @@ vg convert -f components.gbz | sort > sorted.gfa cmp sorted.gfa correct.gfa is $? 0 "GBZ to GFA conversion works with multiple threads" +# GFA extraction from GBZ with/without translation. +vg gbwt --gbz-format -g chopping.gbz --max-node 2 -G graphs/chopping_walks.gfa +vg convert -f -t 1 chopping.gbz > with-translation.gfa +is $? 0 "GBZ to GFA with translation" +is "$(grep -c "^S" with-translation.gfa)" "8" "8 segments" +vg convert -f -t 1 --no-translation chopping.gbz > no-translation.gfa +is $? 0 "GBZ to GFA without translation" +is "$(grep -c "^S" no-translation.gfa)" "9" "9 segments" + rm -f components.gbwt components.gg components.gbz rm -f direct.hg correct_paths.gaf correct_haplotypes.gaf rm -f components.hg hg_paths.gaf hg_haplotypes.gaf gbz_hg_paths.gaf gbz_hg_haplotypes.gaf @@ -368,6 +377,7 @@ rm -f components.xg xg_paths.gaf xg_haplotypes.gaf gbz_xg_paths.gaf gbz_xg_haplo rm -f no_haplotypes.xg no_haplotypes.hg rm -f extracted.gfa gbz.gfa extracted.hg rm -f sorted.gfa correct.gfa +rm -f chopping.gbz with-translation.gfa no-translation.gfa ##### # Reference path conversion