Fix dataset info components (#316)
* simplify get_dataset_info component

* fix script

* change default into example

* fix script

* fix script
rcannood authored Dec 19, 2023
1 parent 17cc7cf commit 4b7c085
Showing 5 changed files with 28 additions and 16 deletions.
@@ -14,7 +14,7 @@ platforms:
     image: ghcr.io/openproblems-bio/base_r:1.0.2
     setup:
       - type: r
-        cran: [ purrr, dplyr, yaml, rlang, processx ]
+        cran: [ yaml, jsonlite ]
   - type: nextflow
     directives:
       label: [lowmem, lowtime, lowcpu]
23 changes: 10 additions & 13 deletions src/common/process_task_results/get_dataset_info/script.R
@@ -1,27 +1,24 @@
-library(purrr, warn.conflicts = FALSE)
-library(dplyr, warn.conflicts = FALSE)
-library(rlang, warn.conflicts = FALSE)
+requireNamespace("jsonlite", quietly = TRUE)
+requireNamespace("yaml", quietly = TRUE)
 
 ## VIASH START
 par <- list(
   input = "resources_test/common/task_metadata/dataset_info.yaml",
-  output = "output/metric_info.json"
+  output = "output/dataset_info.json"
 )
 ## VIASH END
 
 datasets <- yaml::yaml.load_file(par$input)
 
-df <- map_df(datasets, function(dataset) {
-  info <- as_tibble(map(dataset, as.data.frame))
-}) %>%
-  rename(
-    data_url = dataset_url,
-    data_reference = dataset_reference
-  )
+# transform into format expected by website
+datasets_formatted <- lapply(datasets, function(dataset) {
+  dataset$data_url <- dataset$dataset_url
+  dataset$data_reference <- dataset$dataset_reference
+  dataset
+})
 
 jsonlite::write_json(
-  purrr::transpose(df),
+  datasets_formatted,
   par$output,
   auto_unbox = TRUE,
   pretty = TRUE
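For context, here is a minimal, self-contained R sketch of the transformation the simplified script.R now performs. The input list and its values are invented for illustration (they stand in for the parsed dataset_info.yaml); only the field renaming and the jsonlite::write_json call mirror the diff above.

requireNamespace("jsonlite", quietly = TRUE)

# hypothetical stand-in for yaml::yaml.load_file(par$input)
datasets <- list(
  list(
    dataset_id = "example_dataset",
    dataset_url = "https://example.org/data",
    dataset_reference = "doe2023example"
  )
)

# expose dataset_url / dataset_reference under the names the website expects
datasets_formatted <- lapply(datasets, function(dataset) {
  dataset$data_url <- dataset$dataset_url
  dataset$data_reference <- dataset$dataset_reference
  dataset
})

jsonlite::write_json(
  datasets_formatted,
  "output/dataset_info.json",
  auto_unbox = TRUE,
  pretty = TRUE
)

Compared with the previous purrr/dplyr pipeline, each dataset entry stays a plain named list, so no tibble round-trip is needed before serialising to JSON.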
16 changes: 15 additions & 1 deletion src/datasets/resource_scripts/dataset_info.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+DATASETS_DIR="s3://openproblems-data/resources/datasets"
+
 cat > "/tmp/params.yaml" << HERE
 param_list:
   - id: openproblems_v1
@@ -37,4 +39,16 @@ tw launch https://github.com/openproblems-bio/openproblems-v2.git \
   --workspace 53907369739130 \
   --compute-env 1pK56PjjzeraOOC2LDZvN2 \
   --params-file "/tmp/params.yaml" \
-  --config /tmp/nextflow.config
+  --config /tmp/nextflow.config
+
+
+# # run locally after the above has finished
+# nextflow run . \
+#   -main-script target/nextflow/common/process_task_results/get_dataset_info/main.nf \
+#   -profile docker \
+#   -resume \
+#   --input "$DATASETS_DIR/dataset_info.yaml" \
+#   --task_id "common" \
+#   --output "dataset_info.json" \
+#   --output_state state.yaml \
+#   --publish_dir "../website/documentation/reference/datasets/data/"
@@ -16,7 +16,7 @@ functionality:
         direction: input
         description: If defined, only the normalization with this ID will be included in the output.
         multiple: true
-        default: [ log_cp10k ]
+        example: [ log_cp10k ]
   - name: Outputs
     arguments:
       - name: "--output"
1 change: 1 addition & 0 deletions src/datasets/workflows/extract_dataset_info/main.nf
@@ -32,6 +32,7 @@ workflow run_wf {

     | joinStates { ids, states ->
       // remove normalization id
+      // TODO: make this optional through a parameter?
       def dataset_uns = states.collect{state ->
         def uns = state.dataset_uns.clone()
         uns.remove("normalization_id")
