-
Notifications
You must be signed in to change notification settings - Fork 81
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add components for extracting dataset info (#315)
* add dataset comp
* add get_dataset_info to workflow
* commit
* undo changes -- will be addressed in #314
* move get_dataset_info component
* remove unnecessary dependencies
* add component for extracting the dataset info
* fix script
* fix typo
* fix script
* update script
* fix get_dataset_info

---------

Co-authored-by: Kai Waldrant <[email protected]>
- Loading branch information
1 parent
0a22803
commit 17cc7cf
Showing 12 changed files with 216 additions and 22 deletions.
There are no files selected for viewing
20 changes: 20 additions & 0 deletions
20
src/common/process_task_results/get_dataset_info/config.vsh.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Viash component config: extracts dataset info and reformats it for the
# website results. Inherits the shared get_info interface via __merge__.
__merge__: ../api/get_info.yaml
functionality:
  name: "get_dataset_info"
  description: "Extract dataset info and convert to expected format for website results"
  resources:
    # main implementation
    - type: r_script
      path: script.R
  test_resources:
    # example dataset metadata used by the component test
    - type: file
      path: /resources_test/common/task_metadata/dataset_info.yaml
      dest: test_file.yaml
platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_r:1.0.2
    setup:
      - type: r
        cran: [ purrr, dplyr, yaml, rlang, processx ]
  - type: nextflow
    directives:
      # lightweight job: metadata munging only
      label: [lowmem, lowtime, lowcpu]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Extract dataset metadata from a YAML file and write it out as the JSON
# structure expected by the website results.
library(purrr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(rlang, warn.conflicts = FALSE)

## VIASH START
# Fallback parameters for running the script outside of Viash.
# (Fixed: the output placeholder previously said "metric_info.json",
# a copy-paste leftover from the metric-info component.)
par <- list(
  input = "resources_test/common/task_metadata/dataset_info.yaml",
  output = "output/dataset_info.json"
)
## VIASH END

# Read the list of dataset metadata entries.
datasets <- yaml::yaml.load_file(par$input)

# Flatten each dataset's metadata into a one-row tibble and bind the rows.
# (Removed a pointless `info <-` assignment; the closure's value is what
# map_df collects.)
df <- map_df(datasets, function(dataset) {
  as_tibble(map(dataset, as.data.frame))
}) %>%
  # Rename fields to the names expected by the website results format.
  rename(
    data_url = dataset_url,
    data_reference = dataset_reference
  )

# Write one JSON object per dataset (transpose turns the column-wise data
# frame into a list of per-row records).
jsonlite::write_json(
  purrr::transpose(df),
  par$output,
  auto_unbox = TRUE,
  pretty = TRUE
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,16 @@ | ||
"""Read a task-info YAML file and re-emit it verbatim as pretty JSON."""
from os import path  # NOTE(review): unused in this script — confirm before removing
import yaml
import json

## VIASH START
# Placeholder values used only when running outside a Viash component;
# Viash injects the real `par`/`meta` at build time.
par = {
    "input": ".",
    "task_id": "denoising",
    "output": "output/task.json",
}
meta = {"functionality": "foo"}
## VIASH END

# Load the task metadata; safe_load avoids executing arbitrary YAML tags.
with open(par["input"], "r") as f:
    yaml_file = yaml.safe_load(f)

# Dump the loaded structure unchanged as indented JSON.
with open(par["output"], "w") as out:
    json.dump(yaml_file, out, indent=2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#!/bin/bash

# Launch the extract_dataset_info workflow on Seqera Tower.

set -e

# Fail fast if the datasets location is not provided by the environment.
# The heredoc below expands $DATASETS_DIR; when unset it would silently
# produce empty prefixes and broken glob paths.
: "${DATASETS_DIR:?DATASETS_DIR must be set to the datasets directory}"

# Per-source input states; each entry renames its state key to the
# workflow's expected input.
cat > "/tmp/params.yaml" << HERE
param_list:
  - id: openproblems_v1
    input_states: "$DATASETS_DIR/openproblems_v1/**/log_cp10k/state.yaml"
    rename_keys: 'input:output_dataset'
  - id: openproblems_v1_multimodal
    input_states: "$DATASETS_DIR/openproblems_v1_multimodal/**/log_cp10k/state.yaml"
    rename_keys: 'input:output_dataset_mod1'
  - id: cellxgene_census
    input_states: "$DATASETS_DIR/cellxgene_census/**/log_cp10k/state.yaml"
    rename_keys: 'input:output_dataset'
settings: '{"output": "dataset_info.yaml"}'
output_state: state.yaml
publish_dir: "$DATASETS_DIR"
HERE

# Resource overrides for the AWS Batch executor.
cat > /tmp/nextflow.config << HERE
process {
  executor = 'awsbatch'
  withLabel: highmem {
    memory = '350GB'
  }
  withName: '.*publishStatesProc' {
    memory = '16GB'
    disk = '100GB'
  }
}
HERE

tw launch https://github.com/openproblems-bio/openproblems-v2.git \
  --revision main_build \
  --entry-name auto \
  --pull-latest \
  --main-script target/nextflow/datasets/workflows/extract_dataset_info/main.nf \
  --workspace 53907369739130 \
  --compute-env 1pK56PjjzeraOOC2LDZvN2 \
  --params-file "/tmp/params.yaml" \
  --config /tmp/nextflow.config
34 changes: 34 additions & 0 deletions
34
src/datasets/workflows/extract_dataset_info/config.vsh.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Viash workflow config: extracts dataset metadata across datasets,
# optionally filtered to specific normalizations.
functionality:
  name: "extract_dataset_info"
  namespace: "datasets/workflows"
  argument_groups:
    - name: Inputs
      arguments:
        - name: "--input"
          __merge__: /src/datasets/api/file_raw.yaml
          required: true
          direction: input
    - name: Filter arguments
      arguments:
        - name: "--filter_normalization_id"
          type: string
          required: false
          direction: input
          # Fixed description: the argument is `multiple: true`, so it is a
          # list of IDs, not a single ID.
          description: If defined, only normalizations whose ID is in this list will be included in the output.
          multiple: true
          default: [ log_cp10k ]
    - name: Outputs
      arguments:
        - name: "--output"
          type: file
          required: true
          direction: output
          example: dataset_uns.yaml
  resources:
    - type: nextflow_script
      path: main.nf
      entrypoint: run_wf
  dependencies:
    - name: common/check_dataset_schema
platforms:
  - type: nextflow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
// Entry point for `-entry auto`: discover input states from params,
// run the main workflow, and publish each run's resulting state.
workflow auto {
  findStates(params, meta.config)
    | meta.workflow.run(
      auto: [publish: "state"]
    )
}
|
||
// Main workflow: pull the `uns` metadata out of every dataset, keep only
// the requested normalizations, strip the normalization id, and emit a
// single combined YAML file.
workflow run_wf {
  take:
  input_ch

  main:
  output_ch = input_ch

    // extract the dataset metadata: check_dataset_schema returns the
    // file's metadata as a YAML blob; keep its `uns` slot in the state
    | check_dataset_schema.run(
      fromState: [input: "input"],
      toState: { id, output, state ->
        def dataset_uns = (new org.yaml.snakeyaml.Yaml().load(output.meta)).uns
        state + [dataset_uns: dataset_uns]
      }
    )

    // keep only the normalizations listed in --filter_normalization_id
    // (pass everything through when the filter is unset)
    | filter{ id, state ->
      !state.filter_normalization_id ||
        state.filter_normalization_id.contains(state.dataset_uns.normalization_id)
    }

    // collapse all remaining events into one output file
    | joinStates { ids, states ->
      // drop the normalization id from each uns (work on a clone so the
      // original state maps stay untouched)
      def uns_list = states.collect{ s ->
        def uns = s.dataset_uns.clone()
        uns.remove("normalization_id")
        uns
      }

      // serialize the combined metadata to a temporary YAML file
      def uns_yaml = toYamlBlob(uns_list)
      def uns_file = tempFile("dataset_uns.yaml")
      uns_file.write(uns_yaml)

      def new_state = [
        output: uns_file,
        _meta: [join_id: ids[0]]
      ]
      ["output", new_state]
    }

  emit:
  output_ch
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#!/bin/bash

# Run the extract_dataset_info workflow locally against the test resources.

# Fail on the first error — including the git/cd commands below, which the
# original script ran before enabling set -e.
set -e

# Ensure all subsequent commands run from the repository root.
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

# export TOWER_WORKSPACE_ID=53907369739130

OUTPUT_DIR="output/temp"

# mkdir -p is idempotent, so no existence check is needed.
mkdir -p "$OUTPUT_DIR"

DATASETS_DIR="resources_test/common"

export NXF_VER=22.04.5
nextflow run . \
  -main-script target/nextflow/datasets/workflows/extract_dataset_info/main.nf \
  -profile docker \
  -resume \
  -c src/wf_utils/labels_ci.config \
  -entry auto \
  --input_states "$DATASETS_DIR/**/state.yaml" \
  --rename_keys 'input:output_dataset' \
  --settings '{"output": "dataset_info.yaml"}' \
  --publish_dir "$OUTPUT_DIR" \
  --output_state "state.yaml"