Fix dataset wf (#300)
* Add nf-tower cli for dataset loader

* add missing directive labels for dataset loader

* add missing directive labels to process_datasets

* remove space in file name

* update s3 bucket

* increase yaml limit to 5mb

* Fix dataset schema validation and remove unnecessary code to fix meta file size

* Update dataset schema file path in config.vsh.yaml and main.nf

* Add script for processing datasets on nf-tower in bat_int

* Remove dataset_schema input from config.vsh.yaml

* Add output_task_info to workflow configuration

* Update publish directory in process_datasets.sh for bat_int

* Update denoising process_datasets wf

Former-commit-id: 82a6a4d
KaiWaldrant authored Dec 7, 2023
1 parent 8645e63 commit ce8252d
Showing 18 changed files with 90 additions and 32 deletions.
4 changes: 2 additions & 2 deletions src/common/check_dataset_schema/script.py
@@ -73,9 +73,9 @@ def to_dict_of_atomics(obj):
for key, val in adata.uns.items():
if is_atomic(val):
uns[key] = to_atomic(val)
- elif is_list_of_atomics(val):
+ elif is_list_of_atomics(val) and len(val) <= 10:
uns[key] = to_list_of_atomics(val)
- elif is_dict_of_atomics(val):
+ elif is_dict_of_atomics(val) and len(val) <= 10:
uns[key] = to_dict_of_atomics(val)
structure = {
struct: list(getattr(adata, struct).keys())
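The two edits above cap list- and dict-valued `.uns` entries at 10 elements, so large arrays no longer end up in the dataset metadata YAML (the "fix meta file size" item from the commit message). A minimal, self-contained sketch of that filtering logic — the helper names mirror the diff, but the implementations here are illustrative, not the actual `script.py`:

```python
# Sketch of the .uns filtering with the new <= 10 element cap.
# ATOMIC and the is_* helpers are illustrative stand-ins for the
# helpers defined earlier in script.py.

ATOMIC = (int, float, str, bool)

def is_atomic(v):
    return isinstance(v, ATOMIC)

def is_list_of_atomics(v):
    return isinstance(v, (list, tuple)) and all(is_atomic(x) for x in v)

def is_dict_of_atomics(v):
    return isinstance(v, dict) and all(is_atomic(x) for x in v.values())

def filter_uns(uns_in):
    """Keep only small, YAML-friendly .uns entries."""
    uns = {}
    for key, val in uns_in.items():
        if is_atomic(val):
            uns[key] = val
        elif is_list_of_atomics(val) and len(val) <= 10:
            uns[key] = list(val)
        elif is_dict_of_atomics(val) and len(val) <= 10:
            uns[key] = dict(val)
        # anything else (long lists, matrices, nested objects) is dropped
    return uns
```

With this cap, a `.uns` entry holding, say, a 100-element list is silently dropped, while short lists and flat dicts survive into the metadata file.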
@@ -46,7 +46,7 @@ output_dataset_mod2: '$id/dataset_mod2.h5ad'
output_meta_mod1: '$id/dataset_metadata_mod1.yaml'
output_meta_mod2: '$id/dataset_metadata_mod2.yaml'
output_state: '$id/state.yaml'
- publish_dir: s3://openproblems-data/resources/datasets/openproblems_v1_multimodal
+ publish_dir: s3://openproblems-nextflow/resources/datasets/openproblems_v1_multimodal
HERE

tw launch https://github.com/openproblems-bio/openproblems-v2.git \
2 changes: 1 addition & 1 deletion src/datasets/resource_scripts/openproblems_v1_nf_tower.sh
@@ -135,7 +135,7 @@ output_normalized: force_null
output_pca: force_null
output_hvg: force_null
output_knn: force_null
- publish_dir: s3://openproblems-data/resources/datasets/openproblems_v1
+ publish_dir: s3://openproblems-nextflow/resources/datasets/openproblems_v1
HERE

cat > /tmp/nextflow.config << HERE
25 changes: 25 additions & 0 deletions src/tasks/batch_integration/nf_tower_scripts/process_datasets.sh
@@ -0,0 +1,25 @@
#!/bin/bash

cat > /tmp/params.yaml << HERE
id: batch_integration_process_datasets
input_states: s3://openproblems-nextflow/resources/datasets/openproblems_v1/**/state.yaml
rename_keys: 'input:output_dataset'
settings: '{"output_dataset": "dataset.h5ad", "output_solution": "solution.h5ad"}'
publish_dir: s3://openproblems-nextflow/resources/batch_integration/datasets/openproblems_v1
HERE

cat > /tmp/nextflow.config << HERE
process {
executor = 'awsbatch'
}
HERE

tw launch https://github.com/openproblems-bio/openproblems-v2.git \
--revision main_build \
--pull-latest \
--main-script target/nextflow/batch_integration/workflows/process_datasets/main.nf \
--workspace 53907369739130 \
--compute-env 7IkB9ckC81O0dgNemcPJTD \
--params-file /tmp/params.yaml \
--entry-name auto \
--config /tmp/nextflow.config
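The `input_states` glob plus `rename_keys: 'input:output_dataset'` is what chains workflows together: each dataset's published `state.yaml` is picked up, and the value stored under `output_dataset` is fed to this run as `input`. A hypothetical sketch of that remapping, assuming the left-hand side of each `new:old` pair is the key the new workflow expects and the right-hand side is the key in the existing state (the real behaviour lives in the workflow framework, not in this function):

```python
def rename_state_keys(state: dict, rename_keys: str) -> dict:
    """Remap loaded state keys per a comma-separated 'new:old' spec.

    Illustrative only; mirrors how the rename_keys values in these
    scripts read, not the framework's actual implementation.
    """
    renamed = {}
    for pair in rename_keys.split(","):
        new_key, old_key = (p.strip() for p in pair.split(":"))
        renamed[new_key] = state[old_key]
    return renamed
```

For example, `rename_state_keys({"output_dataset": "dataset.h5ad"}, "input:output_dataset")` yields `{"input": "dataset.h5ad"}`, which is exactly the shape the `process_datasets` entrypoint consumes.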
@@ -4,7 +4,7 @@
# try running on nf tower
cat > /tmp/params.yaml << HERE
id: batch_integration
- input_states: s3://openproblems-data/resources/batch_integration/datasets/**/state.yaml
+ input_states: s3://openproblems-nextflow/resources/batch_integration/datasets/**/state.yaml
rename_keys: 'input_dataset:output_dataset,input_solution:output_solution'
settings: '{"output": "scores.tsv"}'
publish_dir: s3://openproblems-nextflow/output/v2/batch_integration
2 changes: 1 addition & 1 deletion src/tasks/batch_integration/nf_tower_scripts/run_test.sh
@@ -5,7 +5,7 @@ DATASET_DIR=resources_test/batch_integration/pancreas
# try running on nf tower
cat > /tmp/params.yaml << HERE
id: batch_integration_test
- input_states: s3://openproblems-data/resources_test/batch_integration/**/state.yaml
+ input_states: s3://openproblems-nextflow/resources_test/batch_integration/**/state.yaml
rename_keys: 'input_dataset:output_dataset,input_solution:output_solution'
settings: '{"output": "scores.tsv"}'
publish_dir: s3://openproblems-nextflow/output_test/v2/batch_integration/
@@ -8,12 +8,6 @@ functionality:
__merge__: "/src/tasks/batch_integration/api/file_common_dataset.yaml"
required: true
direction: input
- name: "--dataset_schema"
type: "file"
description: "The schema of the dataset to validate against"
required: true
default: "src/tasks/batch_integration/api/file_common_dataset.yaml"
direction: input
- name: Outputs
arguments:
- name: "--output_dataset"
@@ -28,6 +22,8 @@ functionality:
- type: nextflow_script
path: main.nf
entrypoint: run_wf
- type: file
path: "/src/tasks/batch_integration/api/file_common_dataset.yaml"
dependencies:
- name: common/check_dataset_schema
- name: batch_integration/process_dataset
11 changes: 7 additions & 4 deletions src/tasks/batch_integration/workflows/process_datasets/main.nf
@@ -15,10 +15,13 @@ workflow run_wf {
// TODO: check schema based on the values in `config`
// instead of having to provide a separate schema file
| check_dataset_schema.run(
-    fromState: [
-      "input": "input",
-      "schema": "dataset_schema"
-    ],
+    fromState: { id, state ->
+      // as a resource
+      [
+        "input": state.input,
+        "schema": meta.resources_dir.resolve("file_common_dataset.yaml")
+      ]
+    },
args: [
"stop_on_error": false
],
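The hunk above is the core of the schema change: rather than accepting the schema path as a `--dataset_schema` argument, the YAML is shipped as a component resource and resolved at run time from `meta.resources_dir`. As a loose Python analogy of that resolution step (the real code is the Groovy closure above; `resolve_resource` is a made-up name for illustration):

```python
from pathlib import Path

def resolve_resource(resources_dir: str, name: str) -> Path:
    """Resolve a file bundled alongside a component, analogous to
    meta.resources_dir.resolve(name) in the Nextflow workflow."""
    path = Path(resources_dir) / name
    if not path.is_file():
        raise FileNotFoundError(f"bundled resource not found: {path}")
    return path
```

Bundling the schema this way removes a required argument from every caller and means the workflow always validates against the schema version it was built with.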
@@ -35,10 +35,17 @@ functionality:
required: true
direction: output
example: dataset_uns.yaml
- name: "--output_task_info"
type: file
required: true
direction: output
example: task_info.yaml
resources:
- type: nextflow_script
path: main.nf
entrypoint: run_wf
- type: file
path: /src/tasks/batch_integration/api/task_info.yaml
dependencies:
- name: common/check_dataset_schema
- name: common/extract_scores
4 changes: 4 additions & 0 deletions src/tasks/batch_integration/workflows/run_benchmark/main.nf
@@ -212,16 +212,20 @@ workflow run_wf {
def metric_configs_file = tempFile("metric_configs.yaml")
metric_configs_file.write(metric_configs_yaml_blob)

def task_info_file = meta.resources_dir.resolve("task_info.yaml")

def new_state = [
output_method_configs: method_configs_file,
output_metric_configs: metric_configs_file,
output_task_info: task_info_file,
_meta: _meta
]
["output", new_state]
}

// merge all of the output data
// todo: add task info?

// todo: add trace log?
output_ch = comp_config_ch
| mix(metric_uns_ch, dataset_meta_ch)
@@ -26,6 +26,6 @@ nextflow run . \
-entry auto \
--input_states "$DATASETS_DIR/**/state.yaml" \
--rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \
- --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml"}' \
+ --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \
--publish_dir "$OUTPUT_DIR" \
--output_state "state.yaml"
25 changes: 25 additions & 0 deletions src/tasks/denoising/nf_tower_scripts/process_datasets.sh
@@ -0,0 +1,25 @@
#!/bin/bash

cat > /tmp/params.yaml << HERE
id: denoising_process_datasets
input_states: s3://openproblems-nextflow/resources/datasets/openproblems_v1/**/state.yaml
rename_keys: 'input:output_dataset'
settings: '{"output_train": "train.h5ad", "output_test": "test.h5ad"}'
publish_dir: s3://openproblems-nextflow/resources/denoising/datasets/openproblems_v1
HERE

cat > /tmp/nextflow.config << HERE
process {
executor = 'awsbatch'
}
HERE

tw launch https://github.com/openproblems-bio/openproblems-v2.git \
--revision main_build \
--pull-latest \
--main-script target/nextflow/denoising/workflows/process_datasets/main.nf \
--workspace 53907369739130 \
--compute-env 7IkB9ckC81O0dgNemcPJTD \
--params-file /tmp/params.yaml \
--entry-name auto \
--config /tmp/nextflow.config
2 changes: 1 addition & 1 deletion src/tasks/denoising/nf_tower_scripts/run_benchmark.sh
@@ -5,7 +5,7 @@ DATASET_DIR=resources_test/denoising/pancreas
# try running on nf tower
cat > /tmp/params.yaml << HERE
id: denoising
- input_states: s3://openproblems-data/resources/denoising/datasets/**/*state.yaml
+ input_states: s3://openproblems-nextflow/resources/denoising/datasets/**/*state.yaml
rename_keys: 'input_train:output_train,input_test:output_test'
settings: '{"output": "scores.tsv"}'
publish_dir: s3://openproblems-nextflow/output/v2/denoising
2 changes: 1 addition & 1 deletion src/tasks/denoising/nf_tower_scripts/run_test.sh
@@ -5,7 +5,7 @@ DATASET_DIR=resources_test/denoising/pancreas
# try running on nf tower
cat > /tmp/params.yaml << HERE
id: denoising_test
- input_states: s3://openproblems-data/resources_test/denoising/pancreas/
+ input_states: s3://openproblems-nextflow/resources_test/denoising/pancreas/
rename_keys: 'input_train:output_train,input_test:output_test'
settings: '{"output": "scores.tsv"}'
publish_dir: s3://openproblems-nextflow/output_test/v2/denoising/
@@ -8,13 +8,6 @@ functionality:
required: true
example: dataset.h5ad
__merge__: "/src/tasks/denoising/api/file_common_dataset.yaml"
- name: Schemas
arguments:
- name: "--dataset_schema"
type: "file"
description: "The schema of the dataset to validate against"
required: true
default: "src/tasks/denoising/api/file_common_dataset.yaml"
- name: Outputs
arguments:
- name: "--output_train"
@@ -29,6 +22,8 @@ functionality:
- type: nextflow_script
path: main.nf
entrypoint: run_wf
- type: file
path: "/src/tasks/denoising/api/file_common_dataset.yaml"
dependencies:
- name: common/check_dataset_schema
- name: denoising/process_dataset
11 changes: 7 additions & 4 deletions src/tasks/denoising/workflows/process_datasets/main.nf
@@ -15,10 +15,13 @@ workflow run_wf {
// TODO: check schema based on the values in `config`
// instead of having to provide a separate schema file
| check_dataset_schema.run(
-    fromState: [
-      "input": "input",
-      "schema": "dataset_schema"
-    ],
+    fromState: { id, state ->
+      // as a resource
+      [
+        "input": state.input,
+        "schema": meta.resources_dir.resolve("file_common_dataset.yaml")
+      ]
+    },
args: [
"stop_on_error": false
],
@@ -4,7 +4,7 @@
# try running on nf tower
cat > /tmp/params.yaml << HERE
id: dimensionality_reduction
- input_states: s3://openproblems-data/resources/dimensionality_reduction/datasets
+ input_states: s3://openproblems-nextflow/resources/dimensionality_reduction/datasets
rename_keys: 'input_dataset:output_dataset,input_solution:output_solution'
settings: '{"output": "scores.tsv"}'
publish_dir: s3://openproblems-nextflow/output/v2/dimensionality_reduction
@@ -4,7 +4,7 @@
# try running on nf tower
cat > /tmp/params.yaml << HERE
id: dimensionality_reduction
- input_states: s3://openproblems-data/resources_test/dimensionality_reduction/pancreas
+ input_states: s3://openproblems-nextflow/resources_test/dimensionality_reduction/pancreas
rename_keys: 'input_dataset:output_dataset,input_solution:output_solution'
settings: '{"output": "scores.tsv"}'
publish_dir: s3://openproblems-nextflow/output_test/v2/dimensionality_reduction
