Add components for extracting dataset info (#315)
* add dataset comp

* add get_dataset_info to workflow

* commit

* undo changes -- will be addressed in #314

* move get_dataset_info component

* remove unnecessary dependencies

* add component for extracting the dataset info

* fix script

* fix typo

* fix script

* update script

* fix get_dataset_info

---------

Co-authored-by: Kai Waldrant <[email protected]>
rcannood and KaiWaldrant authored Dec 19, 2023
1 parent 0a22803 commit 17cc7cf
Showing 12 changed files with 216 additions and 22 deletions.
20 changes: 20 additions & 0 deletions src/common/process_task_results/get_dataset_info/config.vsh.yaml
@@ -0,0 +1,20 @@
__merge__: ../api/get_info.yaml
functionality:
  name: "get_dataset_info"
  description: "Extract dataset info and convert to expected format for website results"
  resources:
    - type: r_script
      path: script.R
  test_resources:
    - type: file
      path: /resources_test/common/task_metadata/dataset_info.yaml
      dest: test_file.yaml
platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_r:1.0.2
    setup:
      - type: r
        cran: [ purrr, dplyr, yaml, rlang, processx ]
  - type: nextflow
    directives:
      label: [lowmem, lowtime, lowcpu]
28 changes: 28 additions & 0 deletions src/common/process_task_results/get_dataset_info/script.R
@@ -0,0 +1,28 @@
library(purrr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(rlang, warn.conflicts = FALSE)

## VIASH START
par <- list(
  input = "resources_test/common/task_metadata/dataset_info.yaml",
  output = "output/dataset_info.json"
)
## VIASH END

datasets <- yaml::yaml.load_file(par$input)

df <- map_df(datasets, function(dataset) {
  as_tibble(map(dataset, as.data.frame))
}) %>%
  rename(
    data_url = dataset_url,
    data_reference = dataset_reference
  )

jsonlite::write_json(
  purrr::transpose(df),
  par$output,
  auto_unbox = TRUE,
  pretty = TRUE
)
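
To see the expected shape of the conversion, here is a minimal standalone sketch of the same pipeline as script.R. The dataset entries, URL, and reference key below are made-up placeholders, not values from the real test resource:

library(purrr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)

# hypothetical stand-in for a parsed dataset_info.yaml: one entry per dataset
datasets <- list(
  list(
    dataset_id = "openproblems_v1/pancreas",
    dataset_url = "https://example.org/pancreas",   # made-up URL
    dataset_reference = "some_reference_key"        # made-up reference key
  )
)

# same pipeline as script.R: one row per dataset, website-friendly column names
df <- map_df(datasets, function(dataset) {
  as_tibble(map(dataset, as.data.frame))
}) %>%
  rename(
    data_url = dataset_url,
    data_reference = dataset_reference
  )

# purrr::transpose() turns the tibble into a list of records, so the JSON
# comes out as an array with one object per dataset
jsonlite::write_json(purrr::transpose(df), "dataset_info.json", auto_unbox = TRUE, pretty = TRUE)

The result is a JSON array with one object per dataset, with dataset_url and dataset_reference renamed to the data_url and data_reference fields the website results expect.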
@@ -15,10 +15,6 @@ platforms:
     setup:
       - type: r
         cran: [ purrr, dplyr, yaml, rlang, processx ]
-      - type: apt
-        packages: [ curl, default-jdk ]
-      - type: docker
-        run: "curl -fsSL dl.viash.io | bash && mv viash /usr/bin/viash"
   - type: nextflow
     directives:
       label: [lowmem, lowtime, lowcpu]
@@ -15,10 +15,6 @@ platforms:
     setup:
       - type: r
         cran: [ purrr, dplyr, yaml, rlang, processx ]
-      - type: apt
-        packages: [ curl, default-jdk ]
-      - type: docker
-        run: "curl -fsSL dl.viash.io | bash && mv viash /usr/bin/viash"
   - type: nextflow
     directives:
       label: [lowmem, lowtime, lowcpu]
1 change: 1 addition & 0 deletions src/common/process_task_results/run/config.vsh.yaml
@@ -79,6 +79,7 @@ functionality:
     - name: common/process_task_results/get_results
     - name: common/process_task_results/get_method_info
     - name: common/process_task_results/get_metric_info
+    - name: common/process_task_results/get_dataset_info
     - name: common/process_task_results/yaml_to_json
 platforms:
   - type: nextflow
3 changes: 1 addition & 2 deletions src/common/process_task_results/run/main.nf
@@ -34,8 +34,7 @@ workflow run_wf {
       }
     )
 
-    | yaml_to_json.run(
-      key: "dataset_info",
+    | get_dataset_info.run(
       fromState: [
         "input": "input_dataset_info",
         "output": "output_dataset_info"
11 changes: 3 additions & 8 deletions src/common/process_task_results/yaml_to_json/script.py
@@ -1,21 +1,16 @@
-from os import path
 import yaml
 import json
 
 ## VIASH START
 par = {
-    "input" : ".",
-    "task_id" : "denoising",
+    "input": ".",
+    "task_id": "denoising",
     "output": "output/task.json",
-
 }
-meta = { "functionality" : "foo" }
-
 ## VIASH END
 
 with open(par["input"], "r") as f:
     yaml_file = yaml.safe_load(f)
 
-
 with open(par["output"], "w") as out:
-    json.dump(yaml_file, out, indent=2)
+    json.dump(yaml_file, out, indent=2)
5 changes: 1 addition & 4 deletions src/common/resources_test_scripts/task_metadata.sh
@@ -128,9 +128,6 @@ nextflow run . \
   -entry auto \
   --input_states "$DATASETS_DIR/**/state.yaml" \
   --rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \
-  --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml"}' \
+  --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \
   --publish_dir "$OUTPUT_DIR" \
   --output_state "state.yaml"
-
-# Copy task info
-cp src/tasks/batch_integration/api/task_info.yaml "$OUTPUT_DIR/task_info.yaml"
40 changes: 40 additions & 0 deletions src/datasets/resource_scripts/dataset_info.sh
@@ -0,0 +1,40 @@
#!/bin/bash

cat > "/tmp/params.yaml" << HERE
param_list:
  - id: openproblems_v1
    input_states: "$DATASETS_DIR/openproblems_v1/**/log_cp10k/state.yaml"
    rename_keys: 'input:output_dataset'
  - id: openproblems_v1_multimodal
    input_states: "$DATASETS_DIR/openproblems_v1_multimodal/**/log_cp10k/state.yaml"
    rename_keys: 'input:output_dataset_mod1'
  - id: cellxgene_census
    input_states: "$DATASETS_DIR/cellxgene_census/**/log_cp10k/state.yaml"
    rename_keys: 'input:output_dataset'
settings: '{"output": "dataset_info.yaml"}'
output_state: state.yaml
publish_dir: "$DATASETS_DIR"
HERE

cat > /tmp/nextflow.config << HERE
process {
  executor = 'awsbatch'
  withLabel: highmem {
    memory = '350GB'
  }
  withName: '.*publishStatesProc' {
    memory = '16GB'
    disk = '100GB'
  }
}
HERE

tw launch https://github.com/openproblems-bio/openproblems-v2.git \
  --revision main_build \
  --entry-name auto \
  --pull-latest \
  --main-script target/nextflow/datasets/workflows/extract_dataset_info/main.nf \
  --workspace 53907369739130 \
  --compute-env 1pK56PjjzeraOOC2LDZvN2 \
  --params-file "/tmp/params.yaml" \
  --config /tmp/nextflow.config
34 changes: 34 additions & 0 deletions src/datasets/workflows/extract_dataset_info/config.vsh.yaml
@@ -0,0 +1,34 @@
functionality:
  name: "extract_dataset_info"
  namespace: "datasets/workflows"
  argument_groups:
    - name: Inputs
      arguments:
        - name: "--input"
          __merge__: /src/datasets/api/file_raw.yaml
          required: true
          direction: input
    - name: Filter arguments
      arguments:
        - name: "--filter_normalization_id"
          type: string
          required: false
          direction: input
          description: If defined, only the normalizations with these IDs will be included in the output.
          multiple: true
          default: [ log_cp10k ]
    - name: Outputs
      arguments:
        - name: "--output"
          type: file
          required: true
          direction: output
          example: dataset_uns.yaml
  resources:
    - type: nextflow_script
      path: main.nf
      entrypoint: run_wf
  dependencies:
    - name: common/check_dataset_schema
platforms:
  - type: nextflow
56 changes: 56 additions & 0 deletions src/datasets/workflows/extract_dataset_info/main.nf
@@ -0,0 +1,56 @@
workflow auto {
  findStates(params, meta.config)
    | meta.workflow.run(
      auto: [publish: "state"]
    )
}

workflow run_wf {
  take:
  input_ch

  main:
  output_ch = input_ch

    // extract the dataset metadata
    | check_dataset_schema.run(
      fromState: [input: "input"],
      toState: { id, output, state ->
        def dataset_uns = (new org.yaml.snakeyaml.Yaml().load(output.meta)).uns
        state + [dataset_uns: dataset_uns]
      }
    )

    // only keep one of the normalization methods
    | filter { id, state ->
      if (state.filter_normalization_id) {
        state.filter_normalization_id.contains(state.dataset_uns.normalization_id)
      } else {
        true
      }
    }

    | joinStates { ids, states ->
      // remove normalization id
      def dataset_uns = states.collect{ state ->
        def uns = state.dataset_uns.clone()
        uns.remove("normalization_id")
        uns
      }

      // store data as yaml
      def dataset_uns_yaml_blob = toYamlBlob(dataset_uns)
      def dataset_uns_file = tempFile("dataset_uns.yaml")
      dataset_uns_file.write(dataset_uns_yaml_blob)

      def new_state = [
        output: dataset_uns_file,
        _meta: [join_id: ids[0]]
      ]
      ["output", new_state]
    }

  emit:
  output_ch
}
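
For intuition, the filter-and-strip step of the workflow above can be mirrored in plain R roughly as follows. The uns entries and normalization IDs below are hypothetical, and the real logic runs per channel item in Groovy rather than over an in-memory list:

library(purrr, warn.conflicts = FALSE)

# hypothetical uns blocks, as check_dataset_schema would surface them
dataset_uns <- list(
  list(dataset_id = "pancreas", normalization_id = "log_cp10k"),
  list(dataset_id = "pancreas", normalization_id = "sqrt_cp10k")
)
filter_normalization_id <- c("log_cp10k")  # mirrors --filter_normalization_id

# keep only the requested normalizations, or everything when no filter is set
kept <- keep(dataset_uns, function(uns) {
  length(filter_normalization_id) == 0 ||
    uns$normalization_id %in% filter_normalization_id
})

# strip normalization_id before publishing, like uns.remove("normalization_id")
stripped <- map(kept, function(uns) {
  uns$normalization_id <- NULL
  uns
})

yaml::write_yaml(stripped, "dataset_uns.yaml")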
32 changes: 32 additions & 0 deletions src/datasets/workflows/extract_dataset_info/run_test.sh
@@ -0,0 +1,32 @@
#!/bin/bash

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

set -e

# export TOWER_WORKSPACE_ID=53907369739130

OUTPUT_DIR="output/temp"

if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
fi

DATASETS_DIR="resources_test/common"

export NXF_VER=22.04.5
nextflow run . \
  -main-script target/nextflow/datasets/workflows/extract_dataset_info/main.nf \
  -profile docker \
  -resume \
  -c src/wf_utils/labels_ci.config \
  -entry auto \
  --input_states "$DATASETS_DIR/**/state.yaml" \
  --rename_keys 'input:output_dataset' \
  --settings '{"output": "dataset_info.yaml"}' \
  --publish_dir "$OUTPUT_DIR" \
  --output_state "state.yaml"
