diff --git a/.Rbuildignore b/.Rbuildignore index e91063e..2642258 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,3 +4,9 @@ ^\.Rproj\.user$ ^\.idea$ ^\.github$ +_pkgdown\.yml +compare_versions +deploy.sh +docs +extras +man-roxygen diff --git a/.github/workflows/R_CMD_check_Hades.yaml b/.github/workflows/R_CMD_check_Hades.yaml index 9a5fe40..96923df 100644 --- a/.github/workflows/R_CMD_check_Hades.yaml +++ b/.github/workflows/R_CMD_check_Hades.yaml @@ -20,9 +20,6 @@ jobs: fail-fast: false matrix: config: - - {os: windows-latest, r: '4.2.3', rtools: '42', rspm: "https://cloud.r-project.org"} - - {os: macOS-latest, r: '4.2.3', rtools: '42', rspm: "https://cloud.r-project.org"} - - {os: ubuntu-20.04, r: '4.2.3', rtools: '42', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - {os: windows-latest, r: 'release', rtools: '', rspm: "https://cloud.r-project.org"} - {os: macOS-latest, r: 'release', rtools: '', rspm: "https://cloud.r-project.org"} - {os: ubuntu-20.04, r: 'release', rtools: '', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} @@ -50,9 +47,23 @@ jobs: CDM5_REDSHIFT_PASSWORD: ${{ secrets.CDM5_REDSHIFT_PASSWORD }} CDM5_REDSHIFT_SERVER: ${{ secrets.CDM5_REDSHIFT_SERVER }} CDM5_REDSHIFT_USER: ${{ secrets.CDM5_REDSHIFT_USER }} + CDM_SNOWFLAKE_CDM53_SCHEMA: ${{ secrets.CDM_SNOWFLAKE_CDM53_SCHEMA }} + CDM_SNOWFLAKE_OHDSI_SCHEMA: ${{ secrets.CDM_SNOWFLAKE_OHDSI_SCHEMA }} + CDM_SNOWFLAKE_PASSWORD: ${{ secrets.CDM_SNOWFLAKE_PASSWORD }} + CDM_SNOWFLAKE_CONNECTION_STRING: ${{ secrets.CDM_SNOWFLAKE_CONNECTION_STRING }} + CDM_SNOWFLAKE_USER: ${{ secrets.CDM_SNOWFLAKE_USER }} + CDM5_SPARK_USER: ${{ secrets.CDM5_SPARK_USER }} + CDM5_SPARK_PASSWORD: ${{ secrets.CDM5_SPARK_PASSWORD }} + CDM5_SPARK_CONNECTION_STRING: ${{ secrets.CDM5_SPARK_CONNECTION_STRING }} + CDM5_SPARK_CDM_SCHEMA: ${{ secrets.CDM5_SPARK_CDM_SCHEMA }} + CDM5_SPARK_OHDSI_SCHEMA: ${{ secrets.CDM5_SPARK_OHDSI_SCHEMA }} + CDM_BIG_QUERY_CONNECTION_STRING: ${{ secrets.CDM_BIG_QUERY_CONNECTION_STRING }} + CDM_BIG_QUERY_KEY_FILE: ${{ secrets.CDM_BIG_QUERY_KEY_FILE }} + CDM_BIG_QUERY_CDM_SCHEMA: ${{ secrets.CDM_BIG_QUERY_CDM_SCHEMA }} + CDM_BIG_QUERY_OHDSI_SCHEMA: ${{ secrets.CDM_BIG_QUERY_OHDSI_SCHEMA }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: @@ -91,10 +102,13 @@ jobs: eval sudo $cmd done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') - - name: Reconfigure Java + - name: Setup Java if: runner.os == 'macOS' - run: R CMD javareconf - + uses: actions/setup-java@v4 + with: + distribution: 'corretto' + java-version: '8' + - name: Install libssh if: runner.os == 'Linux' run: | @@ -102,6 +116,7 @@ jobs: - name: Install dependencies run: | + install.packages("cachem") remotes::install_deps(dependencies = TRUE, INSTALL_opts=c("--no-multiarch")) remotes::install_cran("rcmdcheck") shell: Rscript {0} diff --git a/.github/workflows/R_CMD_check_main_weekly.yaml b/.github/workflows/R_CMD_check_main_weekly.yaml index ecf43ff..30c8dea 100644 --- a/.github/workflows/R_CMD_check_main_weekly.yaml +++ b/.github/workflows/R_CMD_check_main_weekly.yaml @@ -20,30 +20,41 @@ jobs: GITHUB_PAT: ${{ secrets.GH_TOKEN }} R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} - CDM5_ORACLE_CDM_SCHEMA: ${{ secrets.CDM5_ORACLE_CDM_SCHEMA }} + CDM5_ORACLE_CDM_SCHEMA: ${{ secrets.CDM5_ORACLE_CDM54_SCHEMA }} CDM5_ORACLE_OHDSI_SCHEMA: ${{ secrets.CDM5_ORACLE_OHDSI_SCHEMA }} CDM5_ORACLE_PASSWORD: ${{ 
secrets.CDM5_ORACLE_PASSWORD }} CDM5_ORACLE_SERVER: ${{ secrets.CDM5_ORACLE_SERVER }} CDM5_ORACLE_USER: ${{ secrets.CDM5_ORACLE_USER }} - CDM5_POSTGRESQL_CDM_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_CDM_SCHEMA }} + CDM5_POSTGRESQL_CDM_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_CDM54_SCHEMA }} CDM5_POSTGRESQL_OHDSI_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_OHDSI_SCHEMA }} CDM5_POSTGRESQL_PASSWORD: ${{ secrets.CDM5_POSTGRESQL_PASSWORD }} CDM5_POSTGRESQL_SERVER: ${{ secrets.CDM5_POSTGRESQL_SERVER }} CDM5_POSTGRESQL_USER: ${{ secrets.CDM5_POSTGRESQL_USER }} - CDM5_SQL_SERVER_CDM_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_CDM_SCHEMA }} + CDM5_SQL_SERVER_CDM_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_CDM54_SCHEMA }} CDM5_SQL_SERVER_OHDSI_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_OHDSI_SCHEMA }} CDM5_SQL_SERVER_PASSWORD: ${{ secrets.CDM5_SQL_SERVER_PASSWORD }} CDM5_SQL_SERVER_SERVER: ${{ secrets.CDM5_SQL_SERVER_SERVER }} CDM5_SQL_SERVER_USER: ${{ secrets.CDM5_SQL_SERVER_USER }} - CDM5_REDSHIFT_CDM_SCHEMA: ${{ secrets.CDM5_REDSHIFT_CDM_SCHEMA }} + CDM5_REDSHIFT_CDM_SCHEMA: ${{ secrets.CDM5_REDSHIFT_CDM54_SCHEMA }} CDM5_REDSHIFT_OHDSI_SCHEMA: ${{ secrets.CDM5_REDSHIFT_OHDSI_SCHEMA }} CDM5_REDSHIFT_PASSWORD: ${{ secrets.CDM5_REDSHIFT_PASSWORD }} CDM5_REDSHIFT_SERVER: ${{ secrets.CDM5_REDSHIFT_SERVER }} CDM5_REDSHIFT_USER: ${{ secrets.CDM5_REDSHIFT_USER }} + CDM_SNOWFLAKE_CDM53_SCHEMA: ${{ secrets.CDM_SNOWFLAKE_CDM53_SCHEMA }} + CDM_SNOWFLAKE_OHDSI_SCHEMA: ${{ secrets.CDM_SNOWFLAKE_OHDSI_SCHEMA }} + CDM_SNOWFLAKE_PASSWORD: ${{ secrets.CDM_SNOWFLAKE_PASSWORD }} + CDM_SNOWFLAKE_CONNECTION_STRING: ${{ secrets.CDM_SNOWFLAKE_CONNECTION_STRING }} + CDM_SNOWFLAKE_USER: ${{ secrets.CDM_SNOWFLAKE_USER }} CDM5_SPARK_USER: ${{ secrets.CDM5_SPARK_USER }} CDM5_SPARK_PASSWORD: ${{ secrets.CDM5_SPARK_PASSWORD }} CDM5_SPARK_CONNECTION_STRING: ${{ secrets.CDM5_SPARK_CONNECTION_STRING }} - + CDM5_SPARK_CDM_SCHEMA: ${{ secrets.CDM5_SPARK_CDM_SCHEMA }} + CDM5_SPARK_OHDSI_SCHEMA: ${{ secrets.CDM5_SPARK_OHDSI_SCHEMA }} + CDM_BIG_QUERY_CONNECTION_STRING: ${{ secrets.CDM_BIG_QUERY_CONNECTION_STRING }} + CDM_BIG_QUERY_KEY_FILE: ${{ secrets.CDM_BIG_QUERY_KEY_FILE }} + CDM_BIG_QUERY_CDM_SCHEMA: ${{ secrets.CDM_BIG_QUERY_CDM_SCHEMA }} + CDM_BIG_QUERY_OHDSI_SCHEMA: ${{ secrets.CDM_BIG_QUERY_OHDSI_SCHEMA }} + steps: - uses: actions/checkout@v2 diff --git a/DESCRIPTION b/DESCRIPTION index dd768aa..7882e2c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: CohortGenerator Type: Package Title: An R Package for Cohort Generation Against the OMOP CDM -Version: 0.9.0 -Date: 2024-05-28 +Version: 0.10.0 +Date: 2024-07-14 Authors@R: c( person("Anthony", "Sena", email = "sena@ohdsi.org", role = c("aut", "cre")), person("Jamie", "Gilbert", role = c("aut")), @@ -22,13 +22,16 @@ Imports: digest, dplyr, lubridate, + methods, ParallelLogger (>= 3.0.0), readr (>= 2.1.0), rlang, RJSONIO, jsonlite, + ResultModelManager, SqlRender (>= 1.11.1), - stringi (>= 1.7.6) + stringi (>= 1.7.6), + tibble Suggests: CirceR (>= 1.1.1), Eunomia, @@ -36,11 +39,11 @@ Suggests: rmarkdown, ROhdsiWebApi, testthat, - withr + withr, + zip Remotes: - ohdsi/CirceR, - ohdsi/Eunomia, - ohdsi/ROhdsiWebApi + ohdsi/ResultModelManager, + ohdsi/ROhdsiWebApi, License: Apache License VignetteBuilder: knitr URL: https://ohdsi.github.io/CohortGenerator/, https://github.com/OHDSI/CohortGenerator diff --git a/NAMESPACE b/NAMESPACE index becbe95..62124bb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(createDemographicSubset) export(createEmptyCohortDefinitionSet) 
export(createEmptyNegativeControlOutcomeCohortSet)
export(createLimitSubset)
+export(createResultsDataModel)
export(createSubsetCohortWindow)
export(dropCohortStatsTables)
export(exportCohortStatsTables)
@@ -23,9 +24,12 @@ export(generateCohortSet)
export(generateNegativeControlOutcomeCohorts)
export(getCohortCounts)
export(getCohortDefinitionSet)
+export(getCohortInclusionRules)
export(getCohortStats)
export(getCohortTableNames)
+export(getDataMigrator)
export(getRequiredTasks)
+export(getResultsDataModelSpecifications)
export(getSubsetDefinitions)
export(insertInclusionRuleNames)
export(isCamelCase)
@@ -33,17 +37,21 @@ export(isCohortDefinitionSet)
export(isFormattedForDatabaseUpload)
export(isSnakeCase)
export(isTaskRequired)
+export(migrateDataModel)
export(readCsv)
export(recordTasksDone)
+export(runCohortGeneration)
export(sampleCohortDefinitionSet)
export(saveCohortDefinitionSet)
export(saveCohortSubsetDefinition)
export(saveIncremental)
+export(uploadResults)
export(writeCsv)
import(DatabaseConnector)
import(R6)
-importFrom(dplyr,"%>%")
+import(dplyr)
importFrom(grDevices,rgb)
+importFrom(methods,is)
importFrom(rlang,':=')
importFrom(rlang,.data)
importFrom(stats,aggregate)
diff --git a/NEWS.md b/NEWS.md
index d68e06c..d81818c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,20 @@
+CohortGenerator 0.10.0
+=======================
+New Features
+- Add `runCohortGeneration` function (Issue #165)
+- Adopt ResultModelManager for handling results data models & uploading. Extend results data model to include information on cohort subsets (#154, #162)
+- Remove REMOTES entries for CirceR and Eunomia, which are now on CRAN (#145)
+- Unit tests now running on all OHDSI DB Platforms (#151)
+
+Bug Fixes
+- Negation of cohort subset operator must join on `subject_id` AND `start_date` (#167)
+- Allow integer as cohort ID (#146)
+- Use native messaging functions for output vs.
ParallelLogger (#97) +- Prevent upload of inclusion rule information (#78) +- Expose `colTypes` when working with .csv files (#59) +- Remove `bit64` from package (mostly) (#152) +- Updated documentation for cohort subset negate feature (#111) + CohortGenerator 0.9.0 ======================= - Random sample functionality (for development only) (Issue #129) diff --git a/R/CohortConstruction.R b/R/CohortConstruction.R index 2dfb86a..e27d222 100644 --- a/R/CohortConstruction.R +++ b/R/CohortConstruction.R @@ -76,6 +76,7 @@ generateCohortSet <- function(connectionDetails = NULL, "sql" ) ) + assertLargeInteger(cohortDefinitionSet$cohortId) # Verify that cohort IDs are not repeated in the cohort definition # set before generating if (length(unique(cohortDefinitionSet$cohortId)) != length(cohortDefinitionSet$cohortId)) { @@ -249,7 +250,7 @@ generateCohort <- function(cohortId = NULL, connection <- DatabaseConnector::connect(connectionDetails) on.exit(DatabaseConnector::disconnect(connection)) } - ParallelLogger::logInfo(i, "/", nrow(cohortDefinitionSet), "- Generating cohort: ", cohortName, " (id = ", cohortId, ")") + rlang::inform(paste0(i, "/", nrow(cohortDefinitionSet), "- Generating cohort: ", cohortName, " (id = ", cohortId, ")")) sql <- cohortDefinitionSet$sql[i] if (!isSubset) { diff --git a/R/CohortCount.R b/R/CohortCount.R index f14244f..7adad6c 100644 --- a/R/CohortCount.R +++ b/R/CohortCount.R @@ -67,7 +67,7 @@ getCohortCounts <- function(connectionDetails = NULL, if (tolower(cohortTable) %in% tablesInServer) { counts <- DatabaseConnector::querySql(connection, sql, snakeCaseToCamelCase = TRUE) delta <- Sys.time() - start - ParallelLogger::logInfo(paste("Counting cohorts took", signif(delta, 3), attr(delta, "units"))) + rlang::inform(paste("Counting cohorts took", signif(delta, 3), attr(delta, "units"))) if (!is.null(cohortDefinitionSet)) { # If the user has NOT specified a list of cohortIds # to use to filter the cohortDefinitionSet, then @@ -77,15 +77,17 @@ getCohortCounts <- function(connectionDetails = NULL, } counts <- merge( x = counts, - y = cohortDefinitionSet[cohortDefinitionSet$cohortId %in% cohortIds, ], + y = cohortDefinitionSet[cohortDefinitionSet$cohortId %in% cohortIds, , drop = FALSE], by = "cohortId", all.y = TRUE ) - counts <- transform( - counts, - cohortEntries = ifelse(is.na(cohortEntries), 0L, cohortEntries), - cohortSubjects = ifelse(is.na(cohortSubjects), 0L, cohortSubjects) - ) + counts <- with(counts, { + transform( + counts, + cohortEntries = ifelse(is.na(cohortEntries), 0L, cohortEntries), + cohortSubjects = ifelse(is.na(cohortSubjects), 0L, cohortSubjects) + ) + }) } if (!is.null(databaseId)) { counts$databaseId <- databaseId diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R index 7b09547..4404951 100644 --- a/R/CohortDefinitionSet.R +++ b/R/CohortDefinitionSet.R @@ -29,15 +29,41 @@ #' @export createEmptyCohortDefinitionSet <- function(verbose = FALSE) { checkmate::assert_logical(verbose) - cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification() + df <- data.frame( + cohortId = numeric(), + cohortName = character(), + sql = character(), + json = character() + ) if (verbose) { - print(cohortDefinitionSetSpec) + print(df) } - # Build the data.frame dynamically from the cohort definition set spec - df <- .createEmptyDataFrameFromSpecification(cohortDefinitionSetSpec) invisible(df) } +.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) { + checkmate::assert_data_frame(x) + df <- createEmptyCohortDefinitionSet(verbose = 
FALSE) + + # Compare the column names from the input x to an empty cohort + # definition set to ensure the required columns are present + cohortDefinitionSetColumns <- colnames(df) + matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns) + columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns) + + if (!columnNamesMatch && emitWarning) { + columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x)) + warningMessage <- paste0( + "The following columns were missing in your cohortDefinitionSet: ", + paste(columnsMissing, collapse = ","), + ". A cohortDefinitionSet requires the following columns: ", + paste(cohortDefinitionSetColumns, collapse = ",") + ) + warning(warningMessage) + } + invisible(columnNamesMatch) +} + #' Is the data.frame a cohort definition set? #' #' @description @@ -99,7 +125,6 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit checkmate::assert_data_frame(x) df <- createEmptyCohortDefinitionSet(verbose = FALSE) cohortDefinitionSetColumns <- colnames(df) - cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification() columnNamesMatch <- .cohortDefinitionSetHasRequiredColumns(x = x, emitWarning = emitWarning) if (!columnNamesMatch) { @@ -107,7 +132,8 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit } # Compare the data types from the input x to an empty cohort - # definition set to ensure the same data types are present + # definition set to ensure the same data types (or close enough) + # are present dataTypesMatch <- FALSE # Subset x to the required columns xSubset <- x[, cohortDefinitionSetColumns] @@ -116,7 +142,14 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit # Get the reference data types cohortDefinitionSetDataTypes <- sapply(df, typeof) # Check if the data types match - dataTypesMatch <- identical(x = xDataTypes, y = cohortDefinitionSetDataTypes) + # NOTE: createEmptyCohortDefinitionSet() is the reference for the data + # types. cohortId is declared as a numeric but an integer is also fine + dataTypesMatch <- (xDataTypes[1] %in% c("integer", "double") && all(xDataTypes[2:4] == "character")) + # Create the cohortDefinitionSetSpec from the names/data types for reference + cohortDefinitionSetSpec <- data.frame( + columnName = names(xDataTypes), + dataType = xDataTypes + ) if (!dataTypesMatch && emitWarning) { dataTypesMismatch <- setdiff(x = cohortDefinitionSetDataTypes, y = xDataTypes) # Create a column for the warning message @@ -145,50 +178,6 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit )) } -.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) { - checkmate::assert_data_frame(x) - df <- createEmptyCohortDefinitionSet(verbose = FALSE) - cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification() - - # Compare the column names from the input x to an empty cohort - # definition set to ensure the required columns are present - cohortDefinitionSetColumns <- colnames(df) - matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns) - columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns) - - if (!columnNamesMatch && emitWarning) { - columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x)) - warningMessage <- paste0( - "The following columns were missing in your cohortDefinitionSet: ", - paste(columnsMissing, collapse = ","), - ". 
A cohortDefinitionSet requires the following columns: ", - paste(cohortDefinitionSetColumns, collapse = ",") - ) - warning(warningMessage) - } - invisible(columnNamesMatch) -} - -#' Helper function to return the specification description of a -#' cohortDefinitionSet -#' -#' @description -#' This function reads from the cohortDefinitionSetSpecificationDescription.csv -#' to return a data.frame that describes the required columns in a -#' cohortDefinitionSet -#' -#' @return -#' Returns a data.frame that defines a cohortDefinitionSet -#' -#' @noRd -#' @keywords internal -.getCohortDefinitionSetSpecification <- function() { - return(readCsv(system.file("cohortDefinitionSetSpecificationDescription.csv", - package = "CohortGenerator", - mustWork = TRUE - ))) -} - #' Get a cohort definition set #' #' @description @@ -244,7 +233,7 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv", path <- system.file(fileName, package = packageName) } if (verbose) { - ParallelLogger::logInfo(paste0(" -- Loading ", basename(fileName), " from ", path)) + rlang::inform(paste0(" -- Loading ", basename(fileName), " from ", path)) } if (!file.exists(path)) { if (grepl(".json$", tolower(basename(fileName))) && warnOnMissingJson) { @@ -259,10 +248,10 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv", } # Read the settings file which holds the cohortDefinitionSet - ParallelLogger::logInfo("Loading cohortDefinitionSet") + rlang::inform("Loading cohortDefinitionSet") settings <- readCsv(file = getPath(fileName = settingsFileName), warnOnCaseMismatch = FALSE) - assert_settings_columns(names(settings), getPath(fileName = settingsFileName)) + assertSettingsColumns(names(settings), getPath(fileName = settingsFileName)) checkmate::assert_true(all(cohortFileNameValue %in% names(settings))) checkmate::assert_true((!all(.getFileDataColumns() %in% names(settings)))) @@ -313,12 +302,12 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv", # Loading cohort subset definitions with their associated targets if (loadSubsets & nrow(subsetsToLoad) > 0) { if (dir.exists(subsetJsonFolder)) { - ParallelLogger::logInfo("Loading Cohort Subset Definitions") + rlang::inform("Loading Cohort Subset Definitions") ## Loading subsets that apply to the saved definition sets for (i in unique(subsetsToLoad$subsetDefinitionId)) { subsetFile <- file.path(subsetJsonFolder, paste0(i, ".json")) - ParallelLogger::logInfo("Loading Cohort Subset Defintion ", subsetFile) + rlang::inform(paste0("Loading Cohort Subset Defintion ", subsetFile)) subsetDef <- CohortSubsetDefinition$new(ParallelLogger::loadSettingsFromJson(subsetFile)) # Find target cohorts for this subset definition subsetTargetIds <- unique(subsetsToLoad[subsetsToLoad$subsetDefinitionId == i, ]$subsetParent) @@ -382,7 +371,7 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet, checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named") checkmate::assert_vector(cohortFileNameValue) checkmate::assert_true(length(cohortFileNameValue) > 0) - assert_settings_columns(names(cohortDefinitionSet)) + assertSettingsColumns(names(cohortDefinitionSet)) checkmate::assert_true(all(cohortFileNameValue %in% names(cohortDefinitionSet))) settingsFolder <- dirname(settingsFileName) if (!dir.exists(settingsFolder)) { @@ -397,7 +386,7 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet, # Export the cohortDefinitionSet to the settings folder if (verbose) { - ParallelLogger::logInfo("Exporting cohortDefinitionSet to ", 
+    rlang::inform(paste0("Exporting cohortDefinitionSet to ", settingsFileName))
   }
 
   # Write the settings file and ensure that the "sql" and "json" columns are
   # not included
@@ -425,7 +414,7 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet,
     }
 
     if (verbose) {
-      ParallelLogger::logInfo("Exporting (", i, "/", nrow(cohortDefinitionSet), "): ", cohortName)
+      rlang::inform(paste0("Exporting (", i, "/", nrow(cohortDefinitionSet), "): ", cohortName))
     }
 
     if (!is.na(json) && nchar(json) > 0) {
@@ -441,7 +430,7 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet,
     }
   }
 
-  ParallelLogger::logInfo("Cohort definition saved")
+  rlang::inform("Cohort definition saved")
 }
 
 .getSettingsFileRequiredColumns <- function() {
@@ -507,19 +496,37 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) {
   }
 }
 
-.createEmptyDataFrameFromSpecification <- function(specifications) {
-  # Build the data.frame dynamically from the cohort definition set spec
-  df <- data.frame()
-  for (i in 1:nrow(specifications)) {
-    colName <- specifications$columnName[i]
-    dataType <- specifications$dataType[i]
-    if (dataType == "integer64") {
-      df <- df %>% dplyr::mutate(!!colName := do.call(what = bit64::as.integer64, args = list()))
-    } else {
-      df <- df %>% dplyr::mutate(!!colName := do.call(what = dataType, args = list()))
-    }
+#' Custom checkmate assertion for ensuring a vector contains only integer numbers,
+#' including large ones
+#'
+#' @description
+#' This function is used to provide a more informative message when a value
+#' that should be an integer is not one. Since the
+#' cohort definition set allows for storing `numeric` data types, we need
+#' to make sure that the cohort ID has no fractional part.
+#' NOTE: This function is necessary since checkmate::assert_integerish
+#' throws an error for large integers, which is not the desired behavior.
+#'
+#' @param x The vector containing integer/numeric values
+#'
+#' @param columnName The name of the column where this vector came from. This
+#'                   is used when displaying the error message.
+#' @return
+#' Returns TRUE if all the values in x are integers
+#' @noRd
+#' @keywords internal
+checkLargeInteger <- function(x, columnName = "cohortId") {
+  # NOTE: suppressWarnings is used to mask warnings
+  # that R may emit for large values in x.
+  res <- all(suppressWarnings(x %% 1) == 0)
+  if (!isTRUE(res)) {
+    errorMessage <- paste0("The column ", columnName, " included non-integer values. Please update and retry.")
+    return(errorMessage)
+  } else {
+    return(TRUE)
   }
-  invisible(df)
 }
 
 .copySubsetDefinitions <- function(copyToCds, copyFromCds) {
diff --git a/R/CohortGenerator.R b/R/CohortGenerator.R
index b82cdd1..d4588b4 100644
--- a/R/CohortGenerator.R
+++ b/R/CohortGenerator.R
@@ -19,12 +19,14 @@
 #' @import DatabaseConnector
 #' @import R6
+#' @import dplyr
 #' @importFrom grDevices rgb
+#' @importFrom methods is
 #' @importFrom stats aggregate setNames
 #' @importFrom utils write.csv install.packages menu packageVersion sessionInfo
-#' @importFrom dplyr "%>%"
 #' @importFrom rlang .data ':='
 NULL
 
 # Add custom assertions
-assert_settings_columns <- checkmate::makeAssertionFunction(checkSettingsColumns)
+assertSettingsColumns <- checkmate::makeAssertionFunction(checkSettingsColumns)
+assertLargeInteger <- checkmate::makeAssertionFunction(checkLargeInteger)
diff --git a/R/CohortSample.R b/R/CohortSample.R
index 6cb393b..e522244 100644
--- a/R/CohortSample.R
+++ b/R/CohortSample.R
@@ -262,7 +262,7 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
   )
   if (nrow(sampleTable) == 0) {
-    ParallelLogger::logInfo("No entires found for ", targetCohortId, " was it generated?")
+    rlang::inform(paste0("No entries found for ", targetCohortId, "; was it generated?"))
     return(sampledCohortDefinition)
   }
   # Called only for side effects
diff --git a/R/CohortStats.R b/R/CohortStats.R
index 9b1c2b5..2ddaa84 100644
--- a/R/CohortStats.R
+++ b/R/CohortStats.R
@@ -47,14 +47,6 @@ insertInclusionRuleNames <- function(connectionDetails = NULL,
     stop("You must provide either a database connection or the connection details.")
   }
 
-  checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
-  checkmate::assertNames(colnames(cohortDefinitionSet),
-    must.include = c(
-      "cohortId",
-      "cohortName",
-      "json"
-    )
-  )
   if (is.null(connection)) {
     connection <- DatabaseConnector::connect(connectionDetails)
     on.exit(DatabaseConnector::disconnect(connection))
   }
@@ -65,44 +57,7 @@
     stop(paste0(cohortInclusionTable, " table not found in schema: ", cohortDatabaseSchema, ".
Please make sure the table is created using the createCohortTables() function before calling this function.")) } - # Assemble the cohort inclusion rules - # NOTE: This data frame must match the @cohort_inclusion_table - # structure as defined in inst/sql/sql_server/CreateCohortTables.sql - inclusionRules <- data.frame( - cohortDefinitionId = bit64::integer64(), - ruleSequence = integer(), - name = character(), - description = character() - ) - # Remove any cohort definitions that do not include the JSON property - cohortDefinitionSet <- cohortDefinitionSet[!(is.null(cohortDefinitionSet$json) | is.na(cohortDefinitionSet$json)), ] - for (i in 1:nrow(cohortDefinitionSet)) { - cohortDefinition <- RJSONIO::fromJSON(content = cohortDefinitionSet$json[i], digits = 23) - if (!is.null(cohortDefinition$InclusionRules)) { - nrOfRules <- length(cohortDefinition$InclusionRules) - if (nrOfRules > 0) { - for (j in 1:nrOfRules) { - ruleName <- cohortDefinition$InclusionRules[[j]]$name - ruleDescription <- cohortDefinition$InclusionRules[[j]]$description - if (is.na(ruleName) || ruleName == "") { - ruleName <- paste0("Unamed rule (Sequence ", j - 1, ")") - } - if (is.null(ruleDescription)) { - ruleDescription <- "" - } - inclusionRules <- rbind( - inclusionRules, - data.frame( - cohortDefinitionId = bit64::as.integer64(cohortDefinitionSet$cohortId[i]), - ruleSequence = as.integer(j - 1), - name = ruleName, - description = ruleDescription - ) - ) - } - } - } - } + inclusionRules <- getCohortInclusionRules(cohortDefinitionSet) # Remove any existing data to prevent duplication DatabaseConnector::renderTranslateExecuteSql( @@ -116,7 +71,7 @@ insertInclusionRuleNames <- function(connectionDetails = NULL, # Insert the inclusion rules if (nrow(inclusionRules) > 0) { - ParallelLogger::logInfo("Inserting inclusion rule names") + rlang::inform("Inserting inclusion rule names") DatabaseConnector::insertTable( connection = connection, databaseSchema = cohortDatabaseSchema, @@ -152,7 +107,7 @@ getStatsTable <- function(connectionDetails, databaseId <- NULL } - ParallelLogger::logInfo("- Fetching data from ", table) + rlang::inform(paste0("- Fetching data from ", table)) sql <- "SELECT {@database_id != ''}?{CAST('@database_id' as VARCHAR(255)) as database_id,} t.* FROM @cohort_database_schema.@table t" data <- DatabaseConnector::renderTranslateQuerySql( sql = sql, @@ -174,6 +129,7 @@ getStatsTable <- function(connectionDetails, } #' Get Cohort Inclusion Stats Table Data +#' #' @description #' This function returns a data frame of the data in the Cohort Inclusion Tables. #' Results are organized in to a list with 5 different data frames: @@ -244,3 +200,67 @@ getCohortStats <- function(connectionDetails, } return(results) } + + +#' Get Cohort Inclusion Rules from a cohort definition set +#' +#' @description +#' This function returns a data frame of the inclusion rules defined +#' in a cohort definition set. 
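+#'
+#' A minimal usage sketch (hedged: the settings file and folders shown are
+#' illustrative placeholders, not files shipped with this package):
+#' \preformatted{
+#'   cohortDefinitionSet <- getCohortDefinitionSet(
+#'     settingsFileName = "settings/CohortsToCreate.csv",
+#'     jsonFolder = "cohorts",
+#'     sqlFolder = "sql/sql_server"
+#'   )
+#'   inclusionRules <- getCohortInclusionRules(cohortDefinitionSet)
+#' }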
+#'
+#' @md
+#' @template CohortDefinitionSet
+#'
+#' @return A data.frame, returned invisibly, with one row per inclusion rule
+#' (cohortDefinitionId, ruleSequence, name, description).
+#'
+#' @export
+getCohortInclusionRules <- function(cohortDefinitionSet) {
+  checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
+  checkmate::assertNames(colnames(cohortDefinitionSet),
+    must.include = c(
+      "cohortId",
+      "cohortName",
+      "json"
+    )
+  )
+
+  # Assemble the cohort inclusion rules
+  # NOTE: This data frame must match the @cohort_inclusion_table
+  # structure as defined in inst/sql/sql_server/CreateCohortTables.sql
+  inclusionRules <- data.frame(
+    cohortDefinitionId = bit64::integer64(),
+    ruleSequence = integer(),
+    name = character(),
+    description = character()
+  )
+
+  # Remove any cohort definitions that do not include the JSON property
+  cohortDefinitionSet <- cohortDefinitionSet[!(is.null(cohortDefinitionSet$json) | is.na(cohortDefinitionSet$json)), ]
+  for (i in 1:nrow(cohortDefinitionSet)) {
+    cohortDefinition <- RJSONIO::fromJSON(content = cohortDefinitionSet$json[i], digits = 23)
+    if (!is.null(cohortDefinition$InclusionRules)) {
+      nrOfRules <- length(cohortDefinition$InclusionRules)
+      if (nrOfRules > 0) {
+        for (j in 1:nrOfRules) {
+          ruleName <- cohortDefinition$InclusionRules[[j]]$name
+          ruleDescription <- cohortDefinition$InclusionRules[[j]]$description
+          if (is.na(ruleName) || ruleName == "") {
+            ruleName <- paste0("Unnamed rule (Sequence ", j - 1, ")")
+          }
+          if (is.null(ruleDescription)) {
+            ruleDescription <- ""
+          }
+          inclusionRules <- rbind(
+            inclusionRules,
+            data.frame(
+              cohortDefinitionId = bit64::as.integer64(cohortDefinitionSet$cohortId[i]),
+              ruleSequence = as.integer(j - 1),
+              name = ruleName,
+              description = ruleDescription
+            )
+          )
+        }
+      }
+    }
+  }
+
+  invisible(inclusionRules)
+}
diff --git a/R/CohortTables.R b/R/CohortTables.R
index bf0e2b9..19a57c7 100644
--- a/R/CohortTables.R
+++ b/R/CohortTables.R
@@ -98,13 +98,13 @@ createCohortTables <- function(connectionDetails = NULL,
     for (i in 1:length(cohortTableNames)) {
       if (toupper(cohortTableNames[i]) %in% toupper(tables)) {
         createTableFlagList[i] <- FALSE
-        ParallelLogger::logInfo("Table \"", cohortTableNames[i], "\" already exists and in incremental mode, so not recreating it.")
+        rlang::inform(paste0("Table \"", cohortTableNames[i], "\" already exists and in incremental mode, so not recreating it."))
       }
     }
   }
 
   if (any(unlist(createTableFlagList, use.names = FALSE))) {
-    ParallelLogger::logInfo("Creating cohort tables")
+    rlang::inform("Creating cohort tables")
     createSampleTable <- ifelse(
       test = is.null(createTableFlagList$cohortSampleTable),
       yes = FALSE,
@@ -137,7 +137,7 @@ createCohortTables <- function(connectionDetails = NULL,
     DatabaseConnector::executeSql(connection, sql, progressBar = FALSE, reportOverallTime = FALSE)
 
     logCreateTableMessage <- function(schema, tableName) {
-      ParallelLogger::logInfo("- Created table ", schema, ".", tableName)
+      rlang::inform(paste0("- Created table ", schema, ".", tableName))
     }
     for (i in 1:length(createTableFlagList)) {
       if (createTableFlagList[[i]]) {
@@ -146,7 +146,7 @@ createCohortTables <- function(connectionDetails = NULL,
     }
 
     delta <- Sys.time() - start
-    ParallelLogger::logInfo("Creating cohort tables took ", round(delta, 2), attr(delta, "units"))
+    rlang::inform(paste0("Creating cohort tables took ", round(delta, 2), attr(delta, "units")))
   }
 }
 
@@ -173,7 +173,7 @@ dropCohortStatsTables <- function(connectionDetails = NULL,
 
   # Export the stats
   dropTable <- function(table) {
-    ParallelLogger::logInfo("- Dropping ", table)
+    rlang::inform(paste0("- Dropping ", table))
     sql <- "TRUNCATE TABLE @cohort_database_schema.@table;
            DROP TABLE @cohort_database_schema.@table;"
     DatabaseConnector::renderTranslateExecuteSql(
diff --git a/R/CsvHelper.R b/R/CsvHelper.R
index 25d3d10..a1a55b1 100644
--- a/R/CsvHelper.R
+++ b/R/CsvHelper.R
@@ -28,12 +28,42 @@
 #' @param warnOnCaseMismatch When TRUE, raise a warning if column headings
 #'                           in the .csv are not in snake_case format
 #'
+#' @param colTypes Corresponds to the `col_types` in the `readr::read_csv` function.
+#'                 One of `NULL`, a [readr::cols()] specification, or
+#'                 a string. See `vignette("readr")` for more details.
+#'
+#'                 If `NULL`, all column types will be inferred from `guess_max` rows of the
+#'                 input, interspersed throughout the file. This is convenient (and fast),
+#'                 but not robust. If the guessed types are wrong, you'll need to increase
+#'                 `guess_max` or supply the correct types yourself.
+#'
+#'                 Column specifications created by [list()] or [cols()] must contain
+#'                 one column specification for each column.
+#'
+#'                 Alternatively, you can use a compact string representation where each
+#'                 character represents one column:
+#'                 - c = character
+#'                 - i = integer
+#'                 - n = number
+#'                 - d = double
+#'                 - l = logical
+#'                 - f = factor
+#'                 - D = date
+#'                 - T = date time
+#'                 - t = time
+#'                 - ? = guess
+#'                 - _ or - = skip
+#'
+#'                 By default, reading a file without a column specification will print a
+#'                 message showing what `readr` guessed they were. To remove this message,
+#'                 set `show_col_types = FALSE` or set `options(readr.show_col_types = FALSE)`.
+#'
 #' @return
 #' A tibble with the .csv contents
 #'
 #' @export
-readCsv <- function(file, warnOnCaseMismatch = TRUE) {
-  fileContents <- .readCsv(file = file)
+readCsv <- function(file, warnOnCaseMismatch = TRUE, colTypes = readr::cols()) {
+  fileContents <- .readCsv(file = file, colTypes = colTypes)
   columnNames <- colnames(fileContents)
   columnNamesInSnakeCaseFormat <- isSnakeCase(columnNames)
   if (!all(columnNamesInSnakeCaseFormat) && warnOnCaseMismatch) {
@@ -58,10 +88,10 @@
 #'
 #' @noRd
 #' @keywords internal
-.readCsv <- function(file) {
+.readCsv <- function(file, colTypes = readr::cols()) {
   invisible(readr::read_csv(
     file = file,
-    col_types = readr::cols(),
+    col_types = colTypes,
     lazy = FALSE
   ))
 }
diff --git a/R/Export.R b/R/Export.R
index f351821..bab20f6 100644
--- a/R/Export.R
+++ b/R/Export.R
@@ -19,7 +19,14 @@
 #' @description
 #' This function retrieves the data from the cohort statistics tables and
 #' writes them to the inclusion statistics folder specified in the function
-#' call.
+#' call. NOTE: inclusion rule names are handled in one of two ways:
+#'
+#' 1. You can specify the cohortDefinitionSet parameter and the inclusion rule
+#' names will be extracted from the data.frame.
+#' 2. You can insert the inclusion rule names into the database using the
+#' insertInclusionRuleNames function of this package.
+#'
+#' The first approach is preferred, as it avoids the warning that is otherwise emitted.
 #'
 #' @template Connection
 #'
@@ -39,6 +46,11 @@
 #' @param databaseId Optional - when specified, the databaseId will be added
 #'                   to the exported results
 #'
+#' @template CohortDefinitionSet
+#'
+#' @param tablePrefix Optional - allows appending a prefix to the exported
+#'                    file names.
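+#'
+#' @examples
+#' \dontrun{
+#' # Hedged sketch: connectionDetails, the schema name, and the
+#' # cohortDefinitionSet are illustrative placeholders.
+#' exportCohortStatsTables(
+#'   connectionDetails = connectionDetails,
+#'   cohortDatabaseSchema = "results",
+#'   cohortTableNames = getCohortTableNames(cohortTable = "my_cohort"),
+#'   cohortStatisticsFolder = file.path("output", "cohortStats"),
+#'   cohortDefinitionSet = cohortDefinitionSet,
+#'   tablePrefix = "cg_"
+#' )
+#' }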
+#'
+#' @export
 exportCohortStatsTables <- function(connectionDetails,
                                     connection = NULL,
                                     cohortDatabaseSchema,
                                     cohortTableNames = getCohortTableNames(),
                                     cohortStatisticsFolder,
                                     snakeCaseToCamelCase = TRUE,
                                     fileNamesInSnakeCase = FALSE,
                                     incremental = FALSE,
-                                    databaseId = NULL) {
+                                    databaseId = NULL,
+                                    cohortDefinitionSet = NULL,
+                                    tablePrefix = "") {
   if (is.null(connection)) {
     # Establish the connection and ensure the cleanup is performed
     connection <- DatabaseConnector::connect(connectionDetails)
@@ -59,21 +73,12 @@ exportCohortStatsTables <- function(connectionDetails,
     dir.create(cohortStatisticsFolder, recursive = TRUE)
   }
 
-  # Export the stats
-  exportStats <- function(table,
+  # Internal function to export the stats
+  exportStats <- function(data,
                           fileName,
-                          includeDatabaseId) {
-    data <- getStatsTable(
-      connection = connection,
-      table = table,
-      snakeCaseToCamelCase = snakeCaseToCamelCase,
-      databaseId = databaseId,
-      cohortDatabaseSchema = cohortDatabaseSchema,
-      includeDatabaseId = includeDatabaseId
-    )
-
-    fullFileName <- file.path(cohortStatisticsFolder, fileName)
-    ParallelLogger::logInfo("- Saving data to - ", fullFileName)
+                          tablePrefix) {
+    fullFileName <- file.path(cohortStatisticsFolder, paste0(tablePrefix, fileName))
+    rlang::inform(paste0("- Saving data to - ", fullFileName))
     if (incremental) {
       if (snakeCaseToCamelCase) {
         cohortDefinitionIds <- unique(data$cohortDefinitionId)
@@ -88,39 +93,113 @@
   }
 
   tablesToExport <- data.frame(
-    tableName = cohortTableNames$cohortInclusionTable,
-    fileName = "cohort_inclusion.csv",
-    includeDatabaseId = FALSE
+    tableName = c("cohortInclusionResultTable", "cohortInclusionStatsTable", "cohortSummaryStatsTable", "cohortCensorStatsTable"),
+    fileName = c("cohort_inc_result.csv", "cohort_inc_stats.csv", "cohort_summary_stats.csv", "cohort_censor_stats.csv")
   )
 
+  if (is.null(cohortDefinitionSet)) {
+    warning("No cohortDefinitionSet specified; please make sure you've inserted the inclusion rule names using the insertInclusionRuleNames function.")
+    tablesToExport <- rbind(tablesToExport, data.frame(
+      tableName = "cohortInclusionTable",
+      fileName = "cohort_inclusion.csv" # NOTE: exportStats() prepends the tablePrefix
+    ))
+  } else {
+    inclusionRules <- getCohortInclusionRules(cohortDefinitionSet)
+    names(inclusionRules) <- SqlRender::camelCaseToSnakeCase(names(inclusionRules))
+    exportStats(
+      data = inclusionRules,
+      fileName = "cohort_inclusion.csv",
+      tablePrefix = tablePrefix
+    )
+  }
+
+  # Get the cohort statistics
+  cohortStats <- getCohortStats(
+    connectionDetails = connectionDetails,
+    connection = connection,
+    cohortDatabaseSchema = cohortDatabaseSchema,
+    databaseId = databaseId,
+    snakeCaseToCamelCase = snakeCaseToCamelCase,
+    cohortTableNames = cohortTableNames
+  )
-  tablesToExport <- rbind(tablesToExport, data.frame(
-    tableName = cohortTableNames$cohortInclusionResultTable,
-    fileName = "cohort_inc_result.csv",
-    includeDatabaseId = TRUE
-  ))
-  tablesToExport <- rbind(tablesToExport, data.frame(
-    tableName = cohortTableNames$cohortInclusionStatsTable,
-    fileName = "cohort_inc_stats.csv",
-    includeDatabaseId = TRUE
-  ))
-  tablesToExport <- rbind(tablesToExport, data.frame(
-    tableName = cohortTableNames$cohortSummaryStatsTable,
-    fileName = "cohort_summary_stats.csv",
-    includeDatabaseId = TRUE
-  ))
-  tablesToExport <- rbind(tablesToExport, data.frame(
-    tableName = cohortTableNames$cohortCensorStatsTable,
-    fileName = "cohort_censor_stats.csv",
-    includeDatabaseId = TRUE
-  ))
+
   for (i in 1:nrow(tablesToExport))
{ fileName <- ifelse(test = fileNamesInSnakeCase, yes = tablesToExport$fileName[i], no = SqlRender::snakeCaseToCamelCase(tablesToExport$fileName[i]) ) exportStats( - table = tablesToExport$tableName[i], + data = cohortStats[[tablesToExport$tableName[i]]], fileName = fileName, - includeDatabaseId = tablesToExport$includeDatabaseId[i] + tablePrefix = tablePrefix ) } } + +exportCohortDefinitionSet <- function(outputFolder, cohortDefinitionSet = NULL) { + cohortDefinitions <- createEmptyResult("cg_cohort_definition") + cohortSubsets <- createEmptyResult("cg_cohort_subset_definition") + if (!is.null(cohortDefinitionSet)) { + cdsCohortSubsets <- getSubsetDefinitions(cohortDefinitionSet) + if (length(cdsCohortSubsets) > 0) { + for (i in seq_along(cdsCohortSubsets)) { + cohortSubsets <- rbind( + cohortSubsets, + data.frame( + subsetDefinitionId = cdsCohortSubsets[[i]]$definitionId, + json = as.character(cdsCohortSubsets[[i]]$toJSON()) + ) + ) + } + } else { + # NOTE: In this case the cohortDefinitionSet has no subsets defined + # and so we need to add the additional columns that are defined + # in the function: addCohortSubsetDefinition. To do this, + # we'll construct a copy of the cohortDefinitionSet with a single + # subset to get the proper structure and filter it to the + # cohorts of interest. + cdsCopy <- cohortDefinitionSet %>% + addCohortSubsetDefinition( + cohortSubsetDefintion = createCohortSubsetDefinition( + definitionId = 1, + name = "empty", + subsetOperators = list( + createDemographicSubset() + ) + ) + ) %>% + dplyr::filter(.data$cohortId == cohortDefinitionSet$cohortId) + cohortDefinitionSet <- cdsCopy + } + # Massage and save the cohort definition set + colsToRename <- c("cohortId", "cohortName", "sql", "json") + colInd <- which(names(cohortDefinitionSet) %in% colsToRename) + names(cohortDefinitionSet)[colInd] <- c("cohortDefinitionId", "cohortName", "sqlCommand", "json") + if (!"description" %in% names(cohortDefinitionSet)) { + cohortDefinitionSet$description <- "" + } + cohortDefinitions <- cohortDefinitionSet[, intersect(names(cohortDefinitions), names(cohortDefinitionSet))] + } + writeCsv( + x = cohortDefinitions, + file = file.path(outputFolder, "cg_cohort_definition.csv") + ) + writeCsv( + x = cohortSubsets, + file = file.path(outputFolder, "cg_cohort_subset_definition.csv") + ) +} + +createEmptyResult <- function(tableName) { + columns <- readCsv( + file = system.file("csv", "resultsDataModelSpecification.csv", package = "CohortGenerator") + ) %>% + dplyr::filter(.data$tableName == !!tableName) %>% + dplyr::pull(.data$columnName) %>% + SqlRender::snakeCaseToCamelCase() + result <- vector(length = length(columns)) + names(result) <- columns + result <- tibble::as_tibble(t(result), name_repair = "check_unique") + result <- result[FALSE, ] + return(result) +} diff --git a/R/Incremental.R b/R/Incremental.R index de400e4..cb578b0 100644 --- a/R/Incremental.R +++ b/R/Incremental.R @@ -70,7 +70,7 @@ isTaskRequired <- function(..., checksum, recordKeepingFile, verbose = TRUE) { if (verbose) { key <- list(...) 
key <- paste(sprintf("%s = '%s'", names(key), key), collapse = ", ") - ParallelLogger::logInfo("Skipping ", key, " because it is unchanged from earlier run") + rlang::inform(paste0("Skipping ", key, " because it is unchanged from earlier run")) } return(FALSE) } else { @@ -115,7 +115,7 @@ getRequiredTasks <- function(..., checksum, recordKeepingFile) { tasks$checksum <- NULL if (length(idx) > 0) { text <- paste(sprintf("%s = %s", names(tasks), tasks[idx, ]), collapse = ", ") - ParallelLogger::logInfo("Skipping ", text, " because it is unchanged from earlier run") + rlang::inform(paste0("Skipping ", text, " because it is unchanged from earlier run")) tasks <- tasks[-idx, ] } } diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R index 516662c..a031078 100644 --- a/R/NegativeControlCohorts.R +++ b/R/NegativeControlCohorts.R @@ -29,36 +29,17 @@ #' @export createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) { checkmate::assert_logical(verbose) - negativeControlOutcomeCohortSetSpecification <- .getNegativeControlOutcomeCohortSetSpecification() + df <- data.frame( + cohortId = numeric(), + cohortName = character(), + outcomeConceptId = numeric() + ) if (verbose) { - print(negativeControlOutcomeCohortSetSpecification) + print(df) } - # Build the data.frame dynamically - df <- .createEmptyDataFrameFromSpecification(negativeControlOutcomeCohortSetSpecification) invisible(df) } -#' Helper function to return the specification description of a -#' negativeControlOutcomeCohortSet -#' -#' @description -#' This function reads from the negativeControlOutcomeCohortSetSpecificationDescription.csv -#' to return a data.frame that describes the required columns in a -#' negativeControlOutcomeCohortSet -#' -#' @return -#' Returns a data.frame that defines a negativeControlOutcomeCohortSet -#' -#' @noRd -#' @keywords internal -.getNegativeControlOutcomeCohortSetSpecification <- function() { - return(readCsv(system.file("negativeControlOutcomeCohortSetSpecificationDescription.csv", - package = "CohortGenerator", - mustWork = TRUE - ))) -} - - #' Generate a set of negative control outcome cohorts #' #' @description @@ -111,12 +92,14 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL, checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first")) checkmate::assert_logical(detectOnDescendants) checkmate::assertNames(colnames(negativeControlOutcomeCohortSet), - must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName + must.include = names(createEmptyNegativeControlOutcomeCohortSet()) ) checkmate::assert_data_frame( x = negativeControlOutcomeCohortSet, min.rows = 1 ) + assertLargeInteger(negativeControlOutcomeCohortSet$cohortId) + assertLargeInteger(negativeControlOutcomeCohortSet$outcomeConceptId, columnName = "outcomeConceptId") # Verify that cohort IDs are not repeated in the negative control # cohort definition set before generating @@ -163,21 +146,7 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL, stop(paste0("Table: ", cohortTable, " not found in schema: ", cohortDatabaseSchema, ". Please use `createCohortTable` to ensure the cohort table is created before generating cohorts.")) } - ParallelLogger::logInfo("Generating negative control outcome cohorts") - - # Send the negative control outcome cohort set to the server for use - # in processing. 
This temp table will hold the mapping between
-  # cohort_definition_id and the outcomeConceptId in the data.frame()
-  DatabaseConnector::insertTable(
-    connection = connection,
-    data = negativeControlOutcomeCohortSet,
-    tempEmulationSchema = tempEmulationSchema,
-    tableName = "#nc_set",
-    camelCaseToSnakeCase = TRUE,
-    dropTableIfExists = TRUE,
-    createTable = TRUE,
-    tempTable = TRUE
-  )
+  rlang::inform("Generating negative control outcome cohorts")
 
   sql <- createNegativeControlOutcomesQuery(
     connection = connection,
@@ -186,7 +155,8 @@
     cohortDatabaseSchema = cohortDatabaseSchema,
     cohortTable = cohortTable,
     occurrenceType = occurrenceType,
-    detectOnDescendants = detectOnDescendants
+    detectOnDescendants = detectOnDescendants,
+    negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet
   )
 
   DatabaseConnector::executeSql(
@@ -213,7 +183,31 @@ createNegativeControlOutcomesQuery <- function(connection,
                                                cohortDatabaseSchema,
                                                cohortTable,
                                                occurrenceType,
-                                               detectOnDescendants) {
+                                               detectOnDescendants,
+                                               negativeControlOutcomeCohortSet) {
+  selectClause <- ""
+  for (i in 1:nrow(negativeControlOutcomeCohortSet)) {
+    selectClause <- paste0(
+      selectClause,
+      "SELECT CAST(", negativeControlOutcomeCohortSet$cohortId[i], " AS BIGINT), ",
+      "CAST(", negativeControlOutcomeCohortSet$outcomeConceptId[i], " AS BIGINT)"
+    )
+    if (i < nrow(negativeControlOutcomeCohortSet)) {
+      selectClause <- paste0(selectClause, "\nUNION\n")
+    }
+  }
+  ncSetQuery <- paste0(
+    "CREATE TABLE #nc_set (",
+    "  cohort_id bigint NOT NULL,",
+    "  outcome_concept_id bigint NOT NULL",
+    ")",
+    ";",
+    "INSERT INTO #nc_set (cohort_id, outcome_concept_id)\n",
+    selectClause,
+    "\n;"
+  )
+
   sql <- SqlRender::readSql(system.file("sql/sql_server/NegativeControlOutcomes.sql", package = "CohortGenerator", mustWork = TRUE))
   sql <- SqlRender::render(
     sql = sql,
@@ -222,6 +216,7 @@
     cohort_table = cohortTable,
     detect_on_descendants = detectOnDescendants,
     occurrence_type = occurrenceType,
+    nc_set_query = ncSetQuery,
     warnOnMissingParameters = TRUE
   )
   sql <- SqlRender::translate(
diff --git a/R/ResultsDataModel.R b/R/ResultsDataModel.R
new file mode 100644
index 0000000..07643f0
--- /dev/null
+++ b/R/ResultsDataModel.R
@@ -0,0 +1,147 @@
+# Copyright 2024 Observational Health Data Sciences and Informatics
+#
+# This file is part of CohortGenerator
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#' Get specifications for CohortGenerator results data model
+#'
+#' @return
+#' A tibble data frame object with specifications
+#'
+#' @export
+getResultsDataModelSpecifications <- function() {
+  resultsDataModelSpecifications <- readCsv(
+    file = system.file("csv", "resultsDataModelSpecification.csv", package = "CohortGenerator")
+  )
+  return(resultsDataModelSpecifications)
+}
+
+#' Create the results data model tables on a database server.
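+#'
+#' A hedged usage sketch (the SQLite file name is an illustrative placeholder):
+#' \preformatted{
+#'   connectionDetails <- DatabaseConnector::createConnectionDetails(
+#'     dbms = "sqlite",
+#'     server = "results.sqlite"
+#'   )
+#'   createResultsDataModel(
+#'     connectionDetails = connectionDetails,
+#'     databaseSchema = "main",
+#'     tablePrefix = "cg_"
+#'   )
+#' }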
+#'
+#' @details
+#' Only PostgreSQL and SQLite servers are supported.
+#'
+#' @param connectionDetails DatabaseConnector connectionDetails instance; see
+#'                          [DatabaseConnector::createConnectionDetails()]
+#' @param databaseSchema The schema on the server where the tables will be created.
+#' @param tablePrefix (Optional) string to prepend to the database table names
+#' @export
+createResultsDataModel <- function(connectionDetails = NULL,
+                                   databaseSchema,
+                                   tablePrefix = "") {
+  if (connectionDetails$dbms == "sqlite" && databaseSchema != "main") {
+    stop("Invalid schema for sqlite, use databaseSchema = 'main'")
+  }
+
+  connection <- DatabaseConnector::connect(connectionDetails)
+  on.exit(DatabaseConnector::disconnect(connection))
+
+  # Create first version of results model:
+  sql <- SqlRender::readSql(system.file("sql/sql_server/CreateResultsDataModel.sql", package = "CohortGenerator", mustWork = TRUE))
+  sql <- SqlRender::render(
+    sql = sql,
+    database_schema = databaseSchema,
+    table_prefix = tablePrefix
+  )
+  sql <- SqlRender::translate(sql = sql, targetDialect = connection@dbms)
+  DatabaseConnector::executeSql(connection, sql)
+  # Migrate to current version:
+  migrateDataModel(
+    connectionDetails = connectionDetails,
+    databaseSchema = databaseSchema,
+    tablePrefix = tablePrefix
+  )
+}
+
+#' Upload results to the database server.
+#'
+#' @description
+#' Requires that the results data model tables have been created using the \code{\link{createResultsDataModel}} function.
+#'
+#' @param connectionDetails An object of type \code{connectionDetails} as created using the
+#'                          \code{\link[DatabaseConnector]{createConnectionDetails}} function in the
+#'                          DatabaseConnector package.
+#' @param schema The schema on the server where the tables have been created.
+#' @param resultsFolder The folder holding the results in .csv files
+#' @param forceOverWriteOfSpecifications If TRUE, specifications of the phenotypes, cohort definitions, and analysis
+#'                                       will be overwritten if they already exist on the database. Only use this if these specifications
+#'                                       have changed since the last upload.
+#' @param purgeSiteDataBeforeUploading If TRUE, before inserting data for a specific databaseId all the data for
+#'                                     that site will be dropped. This assumes the resultsFolder file contains the full data for that
+#'                                     data site.
+#' @param tablePrefix (Optional) string to prepend to the database table names
+#' @param ... See ResultModelManager::uploadResults
+#' @export
+uploadResults <- function(connectionDetails,
+                          schema,
+                          resultsFolder,
+                          forceOverWriteOfSpecifications = FALSE,
+                          purgeSiteDataBeforeUploading = TRUE,
+                          tablePrefix = "",
+                          ...) {
+  ResultModelManager::uploadResults(
+    connectionDetails = connectionDetails,
+    schema = schema,
+    resultsFolder = resultsFolder,
+    tablePrefix = tablePrefix,
+    forceOverWriteOfSpecifications = forceOverWriteOfSpecifications,
+    purgeSiteDataBeforeUploading = purgeSiteDataBeforeUploading,
+    runCheckAndFixCommands = FALSE,
+    specifications = getResultsDataModelSpecifications(),
+    warnOnMissingTable = FALSE,
+    ...
+  )
+}
+
+#' Migrate Data Model
+#' @description
+#' Migrate data from the current state to the next state
+#'
+#' It is strongly advised that you have a backup of all data (either sqlite files, a backup database in the case you
+#' are using a postgres backend, or the csv/zip files kept from your data generation).
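+#'
+#' A minimal sketch (assumes the same illustrative results database as in
+#' createResultsDataModel() above):
+#' \preformatted{
+#'   migrateDataModel(
+#'     connectionDetails = connectionDetails,
+#'     databaseSchema = "main",
+#'     tablePrefix = "cg_"
+#'   )
+#' }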
+#'
+#' @inheritParams getDataMigrator
+#' @export
+migrateDataModel <- function(connectionDetails, databaseSchema, tablePrefix = "") {
+  rlang::inform("Migrating data set")
+  migrator <- getDataMigrator(
+    connectionDetails = connectionDetails,
+    databaseSchema = databaseSchema,
+    tablePrefix = tablePrefix
+  )
+  migrator$executeMigrations()
+  migrator$finalize()
+}
+
+#' Get database migrations instance
+#' @description
+#'
+#' Returns a ResultModelManager DataMigrationManager instance.
+#' @seealso [ResultModelManager::DataMigrationManager] which this function is a utility for.
+#'
+#' @param connectionDetails DatabaseConnector connection details object
+#' @param databaseSchema String schema where the database tables live
+#' @param tablePrefix (Optional) Use if a table prefix is used before table names (e.g. "cg_")
+#' @returns Instance of ResultModelManager::DataMigrationManager that provides an interface for migrating existing data models
+#' @export
+getDataMigrator <- function(connectionDetails, databaseSchema, tablePrefix = "") {
+  ResultModelManager::DataMigrationManager$new(
+    connectionDetails = connectionDetails,
+    databaseSchema = databaseSchema,
+    tablePrefix = tablePrefix,
+    packageTablePrefix = "cg_",
+    migrationPath = "migrations",
+    packageName = "CohortGenerator"
+  )
+}
diff --git a/R/RunCohortGeneration.R b/R/RunCohortGeneration.R
new file mode 100644
index 0000000..e1565d0
--- /dev/null
+++ b/R/RunCohortGeneration.R
@@ -0,0 +1,306 @@
+# Copyright 2024 Observational Health Data Sciences and Informatics
+#
+# This file is part of CohortGenerator
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#' Run a cohort generation and export results
+#'
+#' @details
+#' Run a cohort generation for a set of cohorts and negative control outcomes.
+#' This function will also export the results of the run to the `outputFolder`.
+#'
+#' @param connectionDetails An object of type \code{connectionDetails} as created using the
+#'                          \code{\link[DatabaseConnector]{createConnectionDetails}} function in the
+#'                          DatabaseConnector package.
+#'
+#' @template CdmDatabaseSchema
+#'
+#' @template TempEmulationSchema
+#'
+#' @template CohortTableNames
+#'
+#' @template CohortDefinitionSet
+#'
+#' @template NegativeControlOutcomeCohortSet
+#'
+#' @param occurrenceType For negative control outcomes, the occurrenceType
+#'                       will detect either: the first time an
+#'                       outcomeConceptId occurs or all times the
+#'                       outcomeConceptId occurs for a person. Values
+#'                       accepted: 'all' or 'first'.
+#'
+#' @param detectOnDescendants For negative control outcomes, when set to TRUE,
+#'                            detectOnDescendants will use the vocabulary to
+#'                            find negative control outcomes using the
+#'                            outcomeConceptId and all descendants via the
+#'                            concept_ancestor table. When FALSE, only the exact
+#'                            outcomeConceptId will be used to detect the
+#'                            outcome.
+#'
+#' @param stopOnError If an error happens while generating one of the
+#'                    cohorts in the cohortDefinitionSet, should we
+#'                    stop processing the other cohorts?
diff --git a/R/RunCohortGeneration.R b/R/RunCohortGeneration.R
new file mode 100644
index 0000000..e1565d0
--- /dev/null
+++ b/R/RunCohortGeneration.R
@@ -0,0 +1,306 @@
+# Copyright 2024 Observational Health Data Sciences and Informatics
+#
+# This file is part of CohortGenerator
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#' Run a cohort generation and export results
+#'
+#' @details
+#' Run a cohort generation for a set of cohorts and negative control outcomes.
+#' This function will also export the results of the run to the `outputFolder`.
+#'
+#' @param connectionDetails An object of type \code{connectionDetails} as created using the
+#'                          \code{\link[DatabaseConnector]{createConnectionDetails}} function in the
+#'                          DatabaseConnector package.
+#'
+#' @template CdmDatabaseSchema
+#'
+#' @template TempEmulationSchema
+#'
+#' @template CohortTableNames
+#'
+#' @template CohortDefinitionSet
+#'
+#' @template NegativeControlOutcomeCohortSet
+#'
+#' @param occurrenceType For negative control outcomes, the occurrenceType
+#'                       will detect either: the first time an
+#'                       outcomeConceptId occurs or all times the
+#'                       outcomeConceptId occurs for a person. Values
+#'                       accepted: 'all' or 'first'.
+#'
+#' @param detectOnDescendants For negative control outcomes, when set to TRUE,
+#'                            detectOnDescendants will use the vocabulary to
+#'                            find negative control outcomes using the
+#'                            outcomeConceptId and all descendants via the
+#'                            concept_ancestor table. When FALSE, only the exact
+#'                            outcomeConceptId will be used to detect the
+#'                            outcome.
+#'
+#' @param stopOnError If an error happens while generating one of the
+#'                    cohorts in the cohortDefinitionSet, should we
+#'                    stop processing the other cohorts? The default is
+#'                    TRUE; when set to FALSE, failures will be
+#'                    identified in the return value from this function.
+#'
+#' @param outputFolder Name of the folder where all the outputs will be written to.
+#'
+#' @param databaseId A unique ID for the database. This will be appended to
+#'                   most tables.
+#'
+#' @param incremental Create only cohorts that haven't been created before?
+#'
+#' @param incrementalFolder If \code{incremental = TRUE}, specify a folder where
+#'                          records are kept of which definition has been
+#'                          executed.
+#'
+#' @export
+runCohortGeneration <- function(connectionDetails,
+                                cdmDatabaseSchema,
+                                tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
+                                cohortDatabaseSchema = cdmDatabaseSchema,
+                                cohortTableNames = getCohortTableNames(),
+                                cohortDefinitionSet = NULL,
+                                negativeControlOutcomeCohortSet = NULL,
+                                occurrenceType = "all",
+                                detectOnDescendants = FALSE,
+                                stopOnError = TRUE,
+                                outputFolder,
+                                databaseId = 1,
+                                incremental = FALSE,
+                                incrementalFolder = NULL) {
+  if (is.null(cohortDefinitionSet) && is.null(negativeControlOutcomeCohortSet)) {
+    stop("You must supply at least 1 cohortDefinitionSet OR 1 negativeControlOutcomeCohortSet")
+  }
+  errorMessages <- checkmate::makeAssertCollection()
+  if (is(connectionDetails, "connectionDetails")) {
+    checkmate::assertClass(connectionDetails, "connectionDetails", add = errorMessages)
+  } else {
+    checkmate::assertClass(connectionDetails, "ConnectionDetails", add = errorMessages)
+  }
+  checkmate::assertCharacter(cdmDatabaseSchema, len = 1, add = errorMessages)
+  checkmate::assertCharacter(tempEmulationSchema, len = 1, null.ok = TRUE, add = errorMessages)
+  checkmate::assertCharacter(cohortDatabaseSchema, len = 1, add = errorMessages)
+  checkmate::assertList(cohortTableNames, min.len = 1, add = errorMessages)
+  checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, null.ok = TRUE, add = errorMessages)
+  checkmate::assertDataFrame(negativeControlOutcomeCohortSet, min.rows = 1, null.ok = TRUE, add = errorMessages)
+  checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first"), add = errorMessages)
+  checkmate::assert_logical(detectOnDescendants, add = errorMessages)
+  checkmate::assert_logical(stopOnError, add = errorMessages)
+  checkmate::reportAssertions(collection = errorMessages)
+
+  # Establish the connection and ensure the cleanup is performed
+  connection <- DatabaseConnector::connect(connectionDetails)
+  on.exit(DatabaseConnector::disconnect(connection))
+
+  # Create the export folder
+  if (!dir.exists(outputFolder)) {
+    dir.create(outputFolder, recursive = TRUE)
+  }
+
+  # Create the cohort tables
+  createCohortTables(
+    connection = connection,
+    cohortDatabaseSchema = cohortDatabaseSchema,
+    cohortTableNames = cohortTableNames,
+    incremental = incremental
+  )
+
+  generateAndExportCohorts(
+    connection = connection,
+    cdmDatabaseSchema = cdmDatabaseSchema,
+    tempEmulationSchema = tempEmulationSchema,
+    cohortDatabaseSchema = cohortDatabaseSchema,
+    cohortTableNames = cohortTableNames,
+    cohortDefinitionSet = cohortDefinitionSet,
+    stopOnError = stopOnError,
+    outputFolder = outputFolder,
+    databaseId = databaseId,
+    incremental = incremental,
+    incrementalFolder = incrementalFolder
+  )
+
+  generateAndExportNegativeControls(
+    connection = connection,
+    cdmDatabaseSchema = cdmDatabaseSchema,
+    tempEmulationSchema = tempEmulationSchema,
+    cohortDatabaseSchema = cohortDatabaseSchema,
+    cohortTableNames = cohortTableNames,
+    negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet,
+    occurrenceType = occurrenceType,
+    detectOnDescendants = detectOnDescendants,
+    outputFolder = outputFolder,
+    databaseId = databaseId,
+    incremental = incremental,
+    incrementalFolder = incrementalFolder
+  )
+
+  # Export the results data model specification
+  file.copy(
+    from = system.file("csv", "resultsDataModelSpecification.csv", package = "CohortGenerator"),
+    to = outputFolder
+  )
+
+  rlang::inform("Cohort generation complete.")
+}
+
+generateAndExportCohorts <- function(connection,
+                                     cdmDatabaseSchema,
+                                     tempEmulationSchema,
+                                     cohortDatabaseSchema,
+                                     cohortTableNames,
+                                     cohortDefinitionSet,
+                                     stopOnError,
+                                     outputFolder,
+                                     databaseId,
+                                     incremental,
+                                     incrementalFolder) {
+  # Generate the cohorts
+  cohortsGenerated <- createEmptyResult("cg_cohort_generation")
+  cohortsGeneratedFileName <- file.path(outputFolder, "cg_cohort_generation.csv")
+  cohortCounts <- createEmptyResult("cg_cohort_count")
+  cohortCountsFileName <- file.path(outputFolder, "cg_cohort_count.csv")
+  if (!is.null(cohortDefinitionSet)) {
+    # Generate cohorts, get counts, write results
+    cohortsGenerated <- generateCohortSet(
+      connection = connection,
+      cdmDatabaseSchema = cdmDatabaseSchema,
+      tempEmulationSchema = tempEmulationSchema,
+      cohortDatabaseSchema = cohortDatabaseSchema,
+      cohortTableNames = cohortTableNames,
+      cohortDefinitionSet = cohortDefinitionSet,
+      stopOnError = stopOnError,
+      incremental = incremental,
+      incrementalFolder = incrementalFolder
+    )
+
+    cohortCountsFromDb <- getCohortCounts(
+      connection = connection,
+      cohortDatabaseSchema = cohortDatabaseSchema,
+      cohortTable = cohortTableNames$cohortTable,
+      cohortDefinitionSet = cohortDefinitionSet,
+      databaseId = databaseId
+    )
+
+    # Filter to columns in the results data model
+    cohortCounts <- cohortCountsFromDb[names(cohortCounts)]
+  }
+
+  # Save the generation information
+  rlang::inform("Saving cohort generation information")
+  if (!is.null(cohortsGenerated) && nrow(cohortsGenerated) > 0) {
+    cohortsGenerated$databaseId <- databaseId
+    # Remove any cohorts that were skipped
+    cohortsGenerated <- cohortsGenerated[toupper(cohortsGenerated$generationStatus) != "SKIPPED", ]
+    if (incremental) {
+      # Format the data for saving
+      names(cohortsGenerated) <- SqlRender::camelCaseToSnakeCase(names(cohortsGenerated))
+      saveIncremental(
+        data = cohortsGenerated,
+        fileName = cohortsGeneratedFileName,
+        cohort_id = cohortsGenerated$cohort_id
+      )
+    } else {
+      writeCsv(
+        x = cohortsGenerated,
+        file = cohortsGeneratedFileName
+      )
+    }
+  }
+
+  rlang::inform("Saving cohort counts")
+  writeCsv(
+    x = cohortCounts,
+    file = cohortCountsFileName
+  )
+
+  rlang::inform("Saving cohort statistics")
+  exportCohortStatsTables(
+    connection = connection,
+    cohortTableNames = cohortTableNames,
+    cohortDatabaseSchema = cohortDatabaseSchema,
+    cohortStatisticsFolder = outputFolder,
+    snakeCaseToCamelCase = FALSE,
+    fileNamesInSnakeCase = TRUE,
+    incremental = incremental,
+    databaseId = databaseId,
+    cohortDefinitionSet = cohortDefinitionSet,
+    tablePrefix = "cg_"
+  )
+
+  # Export the cohort definition set
+  rlang::inform("Saving cohort definition set")
+  exportCohortDefinitionSet(outputFolder, cohortDefinitionSet)
+}
+
+generateAndExportNegativeControls <- function(connection,
+                                              cdmDatabaseSchema,
+                                              tempEmulationSchema,
+                                              cohortDatabaseSchema,
+                                              cohortTableNames,
+                                              negativeControlOutcomeCohortSet,
+                                              occurrenceType,
+                                              detectOnDescendants,
+                                              outputFolder,
+                                              databaseId,
+                                              incremental,
+                                              incrementalFolder) {
+  # Generate any negative controls
+  negativeControlOutcomes <- createEmptyResult("cg_cohort_definition_neg_ctrl")
+  negativeControlOutcomesFileName <- file.path(outputFolder, "cg_cohort_definition_neg_ctrl.csv")
+  cohortCountsNegativeControlOutcomes <- createEmptyResult("cg_cohort_count_neg_ctrl")
+  cohortCountsNegativeControlOutcomesFileName <- file.path(outputFolder, "cg_cohort_count_neg_ctrl.csv")
+  if (!is.null(negativeControlOutcomeCohortSet)) {
+    generateNegativeControlOutcomeCohorts(
+      connection = connection,
+      cdmDatabaseSchema = cdmDatabaseSchema,
+      cohortDatabaseSchema = cohortDatabaseSchema,
+      cohortTable = cohortTableNames$cohortTable,
+      negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet,
+      tempEmulationSchema = tempEmulationSchema,
+      occurrenceType = occurrenceType,
+      detectOnDescendants = detectOnDescendants,
+      incremental = incremental,
+      incrementalFolder = incrementalFolder
+    )
+
+    # Assemble the negativeControlOutcomes for export
+    negativeControlOutcomes <- cbind(
+      negativeControlOutcomeCohortSet,
+      occurrenceType = rep(occurrenceType, nrow(negativeControlOutcomeCohortSet)),
+      detectOnDescendants = rep(detectOnDescendants, nrow(negativeControlOutcomeCohortSet))
+    )
+
+    # Count the negative controls
+    cohortCountsNegativeControlOutcomes <- getCohortCounts(
+      connection = connection,
+      cohortDatabaseSchema = cohortDatabaseSchema,
+      cohortTable = cohortTableNames$cohortTable,
+      databaseId = databaseId,
+      cohortDefinitionSet = negativeControlOutcomeCohortSet[, c("cohortId"), drop = FALSE]
+    )
+  }
+
+  rlang::inform("Saving negative control outcome cohort definition")
+  writeCsv(
+    x = negativeControlOutcomes,
+    file = negativeControlOutcomesFileName
+  )
+
+  rlang::inform("Saving negative control outcome cohort counts")
+  writeCsv(
+    x = cohortCountsNegativeControlOutcomes,
+    file = cohortCountsNegativeControlOutcomesFileName
+  )
+}
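To ground the new entry point, a hedged end-to-end sketch (not part of this diff): the cohort definition set is assumed to be loaded elsewhere (e.g. from a study package), and the negative control concept id is illustrative only.

    library(CohortGenerator)
    connectionDetails <- Eunomia::getEunomiaConnectionDetails()

    # One illustrative negative control outcome row (concept id is an example)
    negativeControls <- rbind(
      createEmptyNegativeControlOutcomeCohortSet(),
      data.frame(cohortId = 101, cohortName = "Ingrowing nail", outcomeConceptId = 139099)
    )

    runCohortGeneration(
      connectionDetails = connectionDetails,
      cdmDatabaseSchema = "main",
      cohortDatabaseSchema = "main",
      cohortTableNames = getCohortTableNames(cohortTable = "my_study"),
      cohortDefinitionSet = cohortDefinitionSet, # assumed loaded elsewhere
      negativeControlOutcomeCohortSet = negativeControls,
      occurrenceType = "first",
      detectOnDescendants = TRUE,
      outputFolder = file.path(tempdir(), "export"),
      databaseId = "Eunomia"
    )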
diff --git a/R/SubsetDefinitions.R b/R/SubsetDefinitions.R
index 4453fd6..19911c7 100644
--- a/R/SubsetDefinitions.R
+++ b/R/SubsetDefinitions.R
@@ -64,7 +64,7 @@ CohortSubsetDefinition <- R6::R6Class(
         subsetOperators = lapply(self$subsetOperators, function(operator) {
           operator$toList()
         }),
-        packageVersion = jsonlite::unbox(as.character(utils::packageVersion(utils::packageName()))),
+        packageVersion = jsonlite::unbox(as.character(utils::packageVersion("CohortGenerator"))),
         identifierExpression = jsonlite::unbox(as.character(private$.identifierExpression)),
         operatorNameConcatString = jsonlite::unbox(as.character(private$.operatorNameConcatString)),
         subsetCohortNameTemplate = jsonlite::unbox(as.character(private$.subsetCohortNameTemplate))
@@ -471,7 +471,7 @@ hasSubsetDefinitions <- function(x) {
 #' @description
 #' This is generally used as part of saveCohortDefinitionSet
 #'
-#' @param subsetDefinition The subset definition object {@seealso CohortSubsetDefinition}
+#' @param subsetDefinition The subset definition object @seealso[CohortSubsetDefinition]
 #'
 #' @export
 #' @inheritParams saveCohortDefinitionSet
diff --git a/R/Subsets.R b/R/Subsets.R
index c50a0f8..c157f72 100644
--- a/R/Subsets.R
+++ b/R/Subsets.R
@@ -35,7 +35,7 @@
 
 # SubsetCohortWindow -------------
-#' SubsetCohortWindow settings
+#' @title Time Window For Cohort Subset Operator
 #' @export
 #' @description
 #' Representation of a time window to use when subsetting a target cohort with a subset cohort
@@ -47,7 +47,6 @@ SubsetCohortWindow <- R6::R6Class(
   private = list(
     .startDay = 0,
     .endDay = 0,
     .targetAnchor = "cohortStart"
   ),
   public = list(
-    #' @title to List
     #' @description List representation of object
     toList = function() {
       objRepr <- list()
@@ -128,7 +127,7 @@ createSubsetCohortWindow <- function(startDay, endDay, targetAnchor) {
 }
 
 # SubsetOperator ------------------------------
-#' @title SubsetOperator
+#' @title Abstract base class for subsets.
 #' @export
 #' @description
 #' Abstract Base Class for subsets. Subsets should inherit from this and implement their own requirements.
@@ -403,7 +402,7 @@ CohortSubsetOperator <- R6::R6Class(
 #'
 #' @param startWindow A SubsetCohortWindow that patients must fall inside (see createSubsetCohortWindow)
 #' @param endWindow A SubsetCohortWindow that patients must fall inside (see createSubsetCohortWindow)
-#' @param negate The opposite of this definition - include patients who do NOT meet the specified criteria (NOT YET IMPLEMENTED)
+#' @param negate The opposite of this definition - include patients who do NOT meet the specified criteria
 #' @returns a CohortSubsetOperator instance
 createCohortSubset <- function(name = NULL, cohortIds, cohortCombinationOperator, negate, startWindow, endWindow) {
   subset <- CohortSubsetOperator$new()
@@ -418,7 +417,10 @@ createCohortSubset <- function(name = NULL, cohortIds, cohortCombinationOperator
 }
 
 # DemographicSubsetOperator ------------------------------
-#' Criteria Subset
+#' @title Demographic Subset Operator
+#' @description
+#' Operators for subsetting a cohort by demographic criteria
+#'
 #' @export
 DemographicSubsetOperator <- R6::R6Class(
   classname = "DemographicSubsetOperator",
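With negate no longer flagged as unimplemented, a sketch (not from this diff; ids are illustrative and the createCohortSubsetDefinition() call follows the package's existing API) of a subset operator that excludes patients found in another cohort:

    # Exclude patients who appear in cohort 999 at any time around cohort start
    excludePrior <- createCohortSubset(
      name = "Not in cohort 999",
      cohortIds = 999,
      cohortCombinationOperator = "all",
      negate = TRUE,
      startWindow = createSubsetCohortWindow(-99999, 0, "cohortStart"),
      endWindow = createSubsetCohortWindow(0, 99999, "cohortStart")
    )
    subsetDef <- createCohortSubsetDefinition(
      name = "Without prior cohort 999",
      definitionId = 1,
      subsetOperators = list(excludePrior)
    )
    cohortDefinitionSet <- addCohortSubsetDefinition(cohortDefinitionSet, subsetDef)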
diff --git a/_pkgdown.yml b/_pkgdown.yml
index d0aa7e1..1258641 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -8,6 +8,13 @@ home:
     href: http://forums.ohdsi.org
 
 reference:
+  - title: "Cohort Generation"
+    desc: >
+      Functions that support generating cohorts.
+    contents:
+      - runCohortGeneration
+      - generateCohortSet
+
   - title: "Cohort Tables"
     desc: >
       Functions that support creating the necessary cohort tables.
@@ -25,13 +32,6 @@ reference:
       - checkAndFixCohortDefinitionSetDataTypes
       - isCohortDefinitionSet
 
-  - title: "Cohort Generation"
-    desc: >
-      Functions that support generating cohorts.
-    contents:
-      - generateCohortSet
-      - createEmptyCohortDefinitionSet
-
   - title: "Cohort Counts"
     desc: >
       Function for obtaining the counts of subjects and events for one or
@@ -39,23 +39,27 @@ reference:
     contents:
       - getCohortCounts
 
-  - title: "Cohort Subset"
+  - title: "Cohort Subset Functions"
     desc: >
-      Functions and R6 classes for creating cohort subset definitions and subset
-      operators.
+      Functions for creating cohort subset definitions and subset operators.
     contents:
       - addCohortSubsetDefinition
-      - CohortSubsetDefinition
-      - CohortSubsetOperator
      - createCohortSubset
       - createCohortSubsetDefinition
       - createDemographicSubset
       - createLimitSubset
       - createSubsetCohortWindow
-      - DemographicSubsetOperator
       - getSubsetDefinitions
-      - LimitSubsetOperator
       - saveCohortSubsetDefinition
+
+  - title: "Cohort Subset Classes"
+    desc: >
+      R6 classes for cohort subset definitions and subset operators.
+    contents:
+      - CohortSubsetDefinition
+      - CohortSubsetOperator
+      - DemographicSubsetOperator
+      - LimitSubsetOperator
       - SubsetCohortWindow
       - SubsetOperator
 
@@ -63,9 +67,12 @@ reference:
     desc: >
       Functions for inserting inclusion rule names from a cohort definition,
       exporting the cohort statistics to the file system and a helper function
-      for dropping those tables when they are no longer needed.
+      for dropping those tables when they are no longer needed. These functions
+      assume you are using [Circe](https://github.com/OHDSI/circe-be) for
+      inclusion rules and cohort statistics.
     contents:
       - getCohortStats
+      - getCohortInclusionRules
       - insertInclusionRuleNames
       - exportCohortStatsTables
       - dropCohortStatsTables
@@ -77,6 +84,17 @@ reference:
     contents:
       - createEmptyNegativeControlOutcomeCohortSet
      - generateNegativeControlOutcomeCohorts
+
+  - title: "Result Model Management"
+    desc: >
+      Functions for managing the results of running Cohort Generator via
+      `runCohortGeneration`
+    contents:
+      - createResultsDataModel
+      - getDataMigrator
+      - getResultsDataModelSpecifications
+      - migrateDataModel
+      - uploadResults
 
   - title: "CSV File Helpers"
     desc: >
diff --git a/docs/404.html b/docs/404.html
index 1fb5c5e..da143bd 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -32,7 +32,7 @@
[pkgdown-generated page; hunk content lost in extraction (site rebuilt for 0.10.0)]
diff --git a/docs/articles/CreatingCohortSubsetDefinitions.html b/docs/articles/CreatingCohortSubsetDefinitions.html
index 5e8b709..71a5f0e 100644
--- a/docs/articles/CreatingCohortSubsetDefinitions.html
+++ b/docs/articles/CreatingCohortSubsetDefinitions.html
[pkgdown-rendered vignette; HTML markup lost in extraction. Recoverable change to the printed example output:]
 #> Cohort Id: 1778211001
-#> Subset Parent Id: 1778211
-#> Name celecoxib - Patients in cohort cohort 1778213 with 365 days prior observation Subset to patients in cohort 1778213, Observation of at least 365 days prior
+#> Cohort Id: 1778214
+#> Subset Parent Id: 1778214
+#> Name celecoxibCensored
[Unchanged context: the vignette text noting that, when adding a subset definition to a cohort definition set, the target cohort ids (e.g. 1778211, 1778212) must exist in the cohortDefinitionSet, as must the output ids.]
diff --git a/docs/articles/GeneratingCohorts.html b/docs/articles/GeneratingCohorts.html
index 2762151..69526fe 100644
--- a/docs/articles/GeneratingCohorts.html
+++ b/docs/articles/GeneratingCohorts.html
[pkgdown-rendered vignette; HTML markup and hunk boundaries lost in extraction. Recoverable changes to the printed example output:
 - the cohort counts now include the new cohort 1778214 (1750 entries, 1750 subjects) alongside 1778211 (1800), 1778212 (569), and 1778213 (266);
 - timings updated ("Counting cohorts took 0.126 secs" -> "0.131 secs"; "Generating cohort set took 0.11 secs" -> "0.13 secs");
 - the incremental-mode log now shows "Initiating cluster consisting only of main thread" and "Skipping cohortId = '1778214' because it is unchanged from earlier run" in addition to the existing skip messages for 1778211-1778213;
 - the "Cohort Statistics (Inclusion Rule Statistics)" walkthrough (insertInclusionRuleNames, exportCohortStatsTables to the InclusionStats folder, dropCohortStatsTables, and the incremental-mode createCohortTables/generateCohortSet examples) is unchanged context around these edits.]
diff --git a/docs/reference/createEmptyCohortDefinitionSet.html b/docs/reference/createEmptyCohortDefinitionSet.html
index 31f3f6a..5edc70f 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/createEmptyNegativeControlOutcomeCohortSet.html b/docs/reference/createEmptyNegativeControlOutcomeCohortSet.html
index 8c2f1e1..008b62d 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/createDemographicSubset.html b/docs/reference/createDemographicSubset.html
index a3879b9..1b231e9 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/createCohortTables.html b/docs/reference/createCohortTables.html
index c28cf0c..4a7c50c 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/DemographicSubsetOperator.html b/docs/reference/DemographicSubsetOperator.html
index b0286f9..58adfff 100644
[page title changed from "Criteria Subset" to "Demographic Subset Operator"; the description now reads "Operators for subsetting a cohort by demographic criteria"]
diff --git a/docs/reference/CohortSubsetOperator.html b/docs/reference/CohortSubsetOperator.html
index eb899f8..a493c14 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/index.html b/docs/index.html
index e325d4e..e722aa9 100644
[rebuilt pkgdown home page; hunk content lost in extraction]
diff --git a/docs/authors.html b/docs/authors.html
index 934e4a3..83a259e 100644
[rebuilt pkgdown authors page; hunk content lost in extraction]
diff --git a/docs/articles/SamplingCohorts.html b/docs/articles/SamplingCohorts.html
index 76bc9ec..e856ab2 100644
[pkgdown-rendered vignette "Sampling Cohorts" (James P. Gilbert, Source: vignettes/SamplingCohorts.Rmd); build date changed 2024-05-28 -> 2024-07-14]
diff --git a/docs/articles/index.html b/docs/articles/index.html
index c0fbc19..23c3513 100644
[rebuilt pkgdown article index; hunk content lost in extraction]
diff --git a/docs/news/index.html b/docs/news/index.html
index abf1fc5..caab98e 100644
--- a/docs/news/index.html
+++ b/docs/news/index.html
[pkgdown-rendered changelog (Source: NEWS.md); markup lost in extraction. Recoverable added text:]
+CohortGenerator 0.10.0
+New Features
+- Add runCohortGeneration function (Issue #165)
+- Adopt ResultModelManager for handling results data models & uploading. Extend results data model to include information on cohort subsets (#154, #162)
+- Remove REMOTES entries for CirceR and Eunomia which are now on CRAN (#145)
+- Unit tests now running on all OHDSI DB Platforms (#151)
+Bug Fixes
+- Negation of cohort subset operator must join on subject_id AND start_date (#167)
+- Allow integer as cohort ID (#146)
+- Use native messaging functions for output vs. ParallelLogger (#97)
+- Prevent upload of inclusion rule information (#78)
+- Expose colTypes when working with .csv files (#59)
+- Remove bit64 from package (mostly) (#152)
+- Updated documentation for cohort subset negate feature (#111)
[Unchanged context: the CohortGenerator 0.9.0 entry, "Random sample functionality (for development only) (Issue #129)".]
diff --git a/docs/reference/CohortSubsetDefinition.html b/docs/reference/CohortSubsetDefinition.html
index 3540129..f71367a 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index ec62a11..475ec55 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -5,5 +5,5 @@ articles:
   CreatingCohortSubsetDefinitions: CreatingCohortSubsetDefinitions.html
   GeneratingCohorts: GeneratingCohorts.html
   SamplingCohorts: SamplingCohorts.html
-last_built: 2024-05-28T17:58Z
+last_built: 2024-07-14T13:51Z
diff --git a/docs/reference/CohortGenerator-package.html b/docs/reference/CohortGenerator-package.html
index 94e32b7..7030844 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/SubsetCohortWindow.html b/docs/reference/SubsetCohortWindow.html
index 08abaf9..3047776 100644
[page title changed from "to List" to "Time Window For Cohort Subset Operator"; the stray "Details / SubsetCohortWindow settings" block was removed; the Active bindings section (startDay, ...) is unchanged context]
diff --git a/docs/reference/LimitSubsetOperator.html b/docs/reference/LimitSubsetOperator.html
index 22f4022..fbcaf92 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/createCohortSubset.html b/docs/reference/createCohortSubset.html
index d143bf1..ae84cb4 100644
[argument table updated: the negate description no longer carries "(NOT YET IMPLEMENTED)"; the startWindow entry is unchanged context]
diff --git a/docs/reference/SubsetOperator.html b/docs/reference/SubsetOperator.html
index b9629d1..1281969 100644
[page title changed from "SubsetOperator" to "Abstract base class for subsets."]
diff --git a/docs/reference/computeChecksum.html b/docs/reference/computeChecksum.html
index 065b877..6b88acc 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/addCohortSubsetDefinition.html b/docs/reference/addCohortSubsetDefinition.html
index 90567f9..465e098 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/checkAndFixCohortDefinitionSetDataTypes.html b/docs/reference/checkAndFixCohortDefinitionSetDataTypes.html
index 947ac3f..b2ac30c 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/createCohortSubsetDefinition.html b/docs/reference/createCohortSubsetDefinition.html
index 73a692c..3244251 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
[New and updated pkgdown reference pages whose diff headers were lost in extraction; the rendered text mirrors the man/*.Rd diffs later in this changeset:
 - createResultsDataModel.html (new): "Create the results data model tables on a database server." with usage, arguments, and the note that only PostgreSQL and SQLite servers are supported;
 - exportCohortStatsTables.html: the description now explains the two ways inclusion rule names are handled (passing the cohortDefinitionSet parameter, preferred, or inserting them via insertInclusionRuleNames) and documents the new cohortDefinitionSet and tablePrefix arguments;
 - getCohortInclusionRules.html (new): returns a data frame of the inclusion rules defined in a cohort definition set;
 - getDataMigrator.html (new): returns a ResultModelManager DataMigrationManager instance;
 - getResultsDataModelSpecifications.html (new): returns a tibble with the results data model specifications.]
[docs/reference/index.html (diff header lost in extraction): the function index was reorganized to match the new _pkgdown.yml -- "Cohort Generation" moved to the top with runCohortGeneration added and createEmptyCohortDefinitionSet moved out; "Cohort Subset" split into "Cohort Subset Functions" and "Cohort Subset Classes"; the SubsetCohortWindow and SubsetOperator entries retitled; getCohortInclusionRules added under "Cohort Statistics" (whose description now mentions Circe); and a new "Result Model Management" section lists createResultsDataModel, getDataMigrator, getResultsDataModelSpecifications, migrateDataModel, and uploadResults, followed by the unchanged "CSV File Helpers" section.]
diff --git a/docs/reference/insertInclusionRuleNames.html b/docs/reference/insertInclusionRuleNames.html
index 4e47142..2e7edc1 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/isCamelCase.html b/docs/reference/isCamelCase.html
index 6c716f1..180c5bd 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/isCohortDefinitionSet.html b/docs/reference/isCohortDefinitionSet.html
index 310f356..fdcd446 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/isFormattedForDatabaseUpload.html b/docs/reference/isFormattedForDatabaseUpload.html
index 0bc0021..f8b9a68 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/isSnakeCase.html b/docs/reference/isSnakeCase.html
index 08554da..985e571 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/isTaskRequired.html b/docs/reference/isTaskRequired.html
index 3a2dfa3..75332da 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/migrateDataModel.html b/docs/reference/migrateDataModel.html
new file mode 100644
index 0000000..772c00e
--- /dev/null
+++ b/docs/reference/migrateDataModel.html
@@ -0,0 +1,120 @@
[new pkgdown reference page; rendered text mirrors man/migrateDataModel.Rd below, including the advice to back up all data (sqlite files, a backup database for a postgres backend, or the csv/zip files from data generation) before migrating]
diff --git a/docs/reference/readCsv.html b/docs/reference/readCsv.html
index cee3cec..42ec1e1 100644
[rebuilt pkgdown reference page; now documents the new colTypes argument, mirroring man/readCsv.Rd below]
diff --git a/docs/reference/recordTasksDone.html b/docs/reference/recordTasksDone.html
index b934c79..056c5aa 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/sampleCohortDefinitionSet.html b/docs/reference/sampleCohortDefinitionSet.html
index 41c171e..c6566eb 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/saveCohortDefinitionSet.html b/docs/reference/saveCohortDefinitionSet.html
index 22c8814..94e3d22 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/reference/saveCohortSubsetDefinition.html b/docs/reference/saveCohortSubsetDefinition.html
index d2291c1..98862b5 100644
[argument table updated to match the revised @seealso wording in man/saveCohortSubsetDefinition.Rd below]
[docs/reference/runCohortGeneration.html (new page; diff header lost in extraction): rendered from man/runCohortGeneration.Rd below -- "Run a cohort generation and export results", Source: R/RunCohortGeneration.R, with the full usage, arguments, and details sections.]
diff --git a/docs/reference/writeCsv.html b/docs/reference/writeCsv.html
index 15328a3..f98829e 100644
[rebuilt pkgdown reference page; hunk content lost in extraction]
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 3808b80..b810c7d 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -72,6 +72,9 @@
[three URL entries added, presumably for the new reference pages; the exact lines were lost in extraction]
[docs/reference/uploadResults.html (new page; diff header lost in extraction): rendered from man/uploadResults.Rd below, noting the results data model tables must first be created with createResultsDataModel. Stray Rd fragments ("}}", "\describe{") from a man/*.Rd hunk whose header was also lost fell at the end of this span.]
diff --git a/man/SubsetOperator.Rd b/man/SubsetOperator.Rd
index 640fa7a..44f7444 100644
--- a/man/SubsetOperator.Rd
+++ b/man/SubsetOperator.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/Subsets.R
\name{SubsetOperator}
\alias{SubsetOperator}
-\title{SubsetOperator}
+\title{Abstract base class for subsets.}
\description{
Abstract Base Class for subsets. Subsets should inherit from this and implement their own requirements.
}
diff --git a/man/createCohortSubset.Rd b/man/createCohortSubset.Rd
index 7c3730f..d2250e6 100644
--- a/man/createCohortSubset.Rd
+++ b/man/createCohortSubset.Rd
@@ -21,7 +21,7 @@ createCohortSubset(
\item{cohortCombinationOperator}{"any" or "all" if using more than one cohort id allow a subject to be in any cohort
or require that they are in all cohorts in specified windows}
-\item{negate}{The opposite of this definition - include patients who do NOT meet the specified criteria (NOT YET IMPLEMENTED)}
+\item{negate}{The opposite of this definition - include patients who do NOT meet the specified criteria}
\item{startWindow}{A SubsetCohortWindow that patients must fall inside (see createSubsetCohortWindow)}
diff --git a/man/createResultsDataModel.Rd b/man/createResultsDataModel.Rd
new file mode 100644
index 0000000..105eb23
--- /dev/null
+++ b/man/createResultsDataModel.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ResultsDataModel.R
+\name{createResultsDataModel}
+\alias{createResultsDataModel}
+\title{Create the results data model tables on a database server.}
+\usage{
+createResultsDataModel(
+ connectionDetails = NULL,
+ databaseSchema,
+ tablePrefix = ""
+)
+}
+\arguments{
+\item{connectionDetails}{DatabaseConnector connectionDetails instance @seealso[DatabaseConnector::createConnectionDetails]}
+
+\item{databaseSchema}{The schema on the server where the tables will be created.}
+
+\item{tablePrefix}{(Optional) string to insert before table names for database table names}
+}
+\description{
+Create the results data model tables on a database server.
+}
+\details{
+Only PostgreSQL and SQLite servers are supported.
+}
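A minimal sketch of the documented flow (not part of this diff; the SQLite file name is hypothetical), creating the results schema in a local SQLite file, one of the two supported backends:

    resultsConnectionDetails <- DatabaseConnector::createConnectionDetails(
      dbms = "sqlite",
      server = "cg_results.sqlite" # hypothetical results database file
    )
    createResultsDataModel(
      connectionDetails = resultsConnectionDetails,
      databaseSchema = "main"
    )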
diff --git a/man/exportCohortStatsTables.Rd b/man/exportCohortStatsTables.Rd
index e4531fe..9791697 100644
--- a/man/exportCohortStatsTables.Rd
+++ b/man/exportCohortStatsTables.Rd
@@ -13,7 +13,9 @@ exportCohortStatsTables(
snakeCaseToCamelCase = TRUE,
fileNamesInSnakeCase = FALSE,
incremental = FALSE,
- databaseId = NULL
+ databaseId = NULL,
+ cohortDefinitionSet = NULL,
+ tablePrefix = ""
)
}
\arguments{
@@ -48,9 +50,27 @@ overwriting an existing results}
\item{databaseId}{Optional - when specified, the databaseId will be added
to the exported results}
+
+\item{cohortDefinitionSet}{The \code{cohortDefinitionSet} argument must be a data frame with
+the following columns: \describe{
+\item{cohortId}{The unique integer identifier of the cohort}
+\item{cohortName}{The cohort's name}
+\item{sql}{The OHDSI-SQL used to generate the cohort}}
+Optionally, this data frame may contain: \describe{
+\item{json}{The Circe JSON representation of the cohort}}}
+
+\item{tablePrefix}{Optional - allows appending a prefix to the exported
+file names.}
}
\description{
This function retrieves the data from the cohort statistics tables and
writes them to the inclusion statistics folder specified in the function
-call.
+call. NOTE: inclusion rule names are handled in one of two ways:
+
+1. You can specify the cohortDefinitionSet parameter and the inclusion rule
+names will be extracted from the data.frame.
+2. You can insert the inclusion rule names into the database using the
+insertInclusionRuleNames function of this package.
+
+The first approach is preferred so as to avoid the warning emitted.
}
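A sketch of the preferred pattern described above, passing cohortDefinitionSet so the inclusion rule names come from the data frame; connectionDetails and cohortDefinitionSet are assumed to exist, and the leading connection arguments are not shown in this hunk, so their form here is an assumption:

    exportCohortStatsTables(
      connectionDetails = connectionDetails,       # assumed defined
      cohortDatabaseSchema = "main",
      cohortTableNames = getCohortTableNames(),
      cohortStatisticsFolder = file.path(tempdir(), "InclusionStats"),
      cohortDefinitionSet = cohortDefinitionSet,   # preferred: rules read from the data frame
      tablePrefix = "cg_"
    )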
diff --git a/man/getCohortInclusionRules.Rd b/man/getCohortInclusionRules.Rd
new file mode 100644
index 0000000..cfb9589
--- /dev/null
+++ b/man/getCohortInclusionRules.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/CohortStats.R
+\name{getCohortInclusionRules}
+\alias{getCohortInclusionRules}
+\title{Get Cohort Inclusion Rules from a cohort definition set}
+\usage{
+getCohortInclusionRules(cohortDefinitionSet)
+}
+\arguments{
+\item{cohortDefinitionSet}{The \code{cohortDefinitionSet} argument must be a data frame with
+the following columns: \describe{
+\item{cohortId}{The unique integer identifier of the cohort}
+\item{cohortName}{The cohort's name}
+\item{sql}{The OHDSI-SQL used to generate the cohort}}
+Optionally, this data frame may contain: \describe{
+\item{json}{The Circe JSON representation of the cohort}}}
+}
+\description{
+This function returns a data frame of the inclusion rules defined
+in a cohort definition set.
+}
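A one-line sketch (not from this diff; the cohortDefinitionSet is assumed to carry Circe JSON in its json column):

    # Pull the inclusion rules defined across the cohort definition set
    inclusionRules <- getCohortInclusionRules(cohortDefinitionSet)
    head(inclusionRules)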
diff --git a/man/getDataMigrator.Rd b/man/getDataMigrator.Rd
new file mode 100644
index 0000000..dfd2758
--- /dev/null
+++ b/man/getDataMigrator.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ResultsDataModel.R
+\name{getDataMigrator}
+\alias{getDataMigrator}
+\title{Get database migrations instance}
+\usage{
+getDataMigrator(connectionDetails, databaseSchema, tablePrefix = "")
+}
+\arguments{
+\item{connectionDetails}{DatabaseConnector connection details object}
+
+\item{databaseSchema}{String schema where database schema lives}
+
+\item{tablePrefix}{(Optional) Use if a table prefix is used before table names (e.g. "cg_")}
+}
+\value{
+Instance of ResultModelManager::DataMigrationManager that has interface for converting existing data models
+}
+\description{
+Returns ResultModelManager DataMigrationsManager instance.
+}
diff --git a/man/getResultsDataModelSpecifications.Rd b/man/getResultsDataModelSpecifications.Rd
new file mode 100644
index 0000000..b528db2
--- /dev/null
+++ b/man/getResultsDataModelSpecifications.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ResultsDataModel.R
+\name{getResultsDataModelSpecifications}
+\alias{getResultsDataModelSpecifications}
+\title{Get specifications for CohortGenerator results data model}
+\usage{
+getResultsDataModelSpecifications()
+}
+\value{
+A tibble data frame object with specifications
+}
+\description{
+Get specifications for CohortGenerator results data model
+}
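A tiny sketch (not from this diff) of inspecting the specification; the tableName column name follows the usual HADES camelCase convention and is an assumption here, as the exact columns are not shown:

    spec <- getResultsDataModelSpecifications()
    # Assumed column name; lists the cg_* tables the data model defines
    unique(spec$tableName)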
diff --git a/man/migrateDataModel.Rd b/man/migrateDataModel.Rd
new file mode 100644
index 0000000..a23acdf
--- /dev/null
+++ b/man/migrateDataModel.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ResultsDataModel.R
+\name{migrateDataModel}
+\alias{migrateDataModel}
+\title{Migrate Data model}
+\usage{
+migrateDataModel(connectionDetails, databaseSchema, tablePrefix = "")
+}
+\arguments{
+\item{connectionDetails}{DatabaseConnector connection details object}
+
+\item{databaseSchema}{String schema where database schema lives}
+
+\item{tablePrefix}{(Optional) Use if a table prefix is used before table names (e.g. "cg_")}
+}
+\description{
+Migrate data from current state to next state
+
+It is strongly advised that you have a backup of all data: either the sqlite files, a backup database (in the case you
+are using a postgres backend), or the csv/zip files kept from your data generation.
+}
diff --git a/man/readCsv.Rd b/man/readCsv.Rd
index 8497b54..9eac7ea 100644
--- a/man/readCsv.Rd
+++ b/man/readCsv.Rd
@@ -4,13 +4,43 @@
\alias{readCsv}
\title{Used to read a .csv file}
\usage{
-readCsv(file, warnOnCaseMismatch = TRUE)
+readCsv(file, warnOnCaseMismatch = TRUE, colTypes = readr::cols())
}
\arguments{
\item{file}{The .csv file to read.}
\item{warnOnCaseMismatch}{When TRUE, raise a warning if column headings
in the .csv are not in snake_case format}
+
+\item{colTypes}{Corresponds to the `col_types` in the `readr::read_csv` function.
+ One of `NULL`, a [readr::cols()] specification, or
+ a string. See `vignette("readr")` for more details.
+
+ If `NULL`, all column types will be inferred from `guess_max` rows of the
+ input, interspersed throughout the file. This is convenient (and fast),
+ but not robust. If the guessed types are wrong, you'll need to increase
+ `guess_max` or supply the correct types yourself.
+
+ Column specifications created by [list()] or [cols()] must contain
+ one column specification for each column.
+
+ Alternatively, you can use a compact string representation where each
+ character represents one column:
+ - c = character
+ - i = integer
+ - n = number
+ - d = double
+ - l = logical
+ - f = factor
+ - D = date
+ - T = date time
+ - t = time
+ - ? = guess
+ - _ or - = skip
+
+ By default, reading a file without a column specification will print a
+ message showing what `readr` guessed they were. To remove this message,
+ set `show_col_types = FALSE` or set `options(readr.show_col_types = FALSE)`.}
}
\value{
A tibble with the .csv contents
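A sketch of the new colTypes argument (not from this diff): force cohort_id to be read as an integer while the remaining columns are guessed. "cg_cohort_count.csv" is one of the files exported by runCohortGeneration.

    counts <- readCsv(
      file = "cg_cohort_count.csv",
      colTypes = readr::cols(cohort_id = readr::col_integer())
    )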
diff --git a/man/runCohortGeneration.Rd b/man/runCohortGeneration.Rd
new file mode 100644
index 0000000..7781f28
--- /dev/null
+++ b/man/runCohortGeneration.Rd
@@ -0,0 +1,95 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RunCohortGeneration.R
+\name{runCohortGeneration}
+\alias{runCohortGeneration}
+\title{Run a cohort generation and export results}
+\usage{
+runCohortGeneration(
+ connectionDetails,
+ cdmDatabaseSchema,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
+ cohortDatabaseSchema = cdmDatabaseSchema,
+ cohortTableNames = getCohortTableNames(),
+ cohortDefinitionSet = NULL,
+ negativeControlOutcomeCohortSet = NULL,
+ occurrenceType = "all",
+ detectOnDescendants = FALSE,
+ stopOnError = TRUE,
+ outputFolder,
+ databaseId = 1,
+ incremental = FALSE,
+ incrementalFolder = NULL
+)
+}
+\arguments{
+\item{connectionDetails}{An object of type \code{connectionDetails} as created using the
+\code{\link[DatabaseConnector]{createConnectionDetails}} function in the
+DatabaseConnector package.}
+
+\item{cdmDatabaseSchema}{Schema name where your patient-level data in OMOP CDM format resides.
+Note that for SQL Server, this should include both the database and
+schema name, for example 'cdm_data.dbo'.}
+
+\item{tempEmulationSchema}{Some database platforms like Oracle and Impala do not truly support
+temp tables. To emulate temp tables, provide a schema with write
+privileges where temp tables can be created.}
+
+\item{cohortDatabaseSchema}{Schema name where your cohort tables reside. Note that for SQL Server,
+this should include both the database and schema name, for example
+'scratch.dbo'.}
+
+\item{cohortTableNames}{The names of the cohort tables. See \code{\link{getCohortTableNames}}
+for more details.}
+
+\item{cohortDefinitionSet}{The \code{cohortDefinitionSet} argument must be a data frame with
+the following columns: \describe{
+\item{cohortId}{The unique integer identifier of the cohort}
+\item{cohortName}{The cohort's name}
+\item{sql}{The OHDSI-SQL used to generate the cohort}}
+Optionally, this data frame may contain: \describe{
+\item{json}{The Circe JSON representation of the cohort}}}
+
+\item{negativeControlOutcomeCohortSet}{The \code{negativeControlOutcomeCohortSet} argument must be a data frame with
+the following columns: \describe{
+\item{cohortId}{The unique integer identifier of the cohort}
+\item{cohortName}{The cohort's name}
+\item{outcomeConceptId}{The concept_id in the condition domain to use for the negative control outcome.}}}
+
+\item{occurrenceType}{For negative control outcomes, the occurrenceType
+will detect either: the first time an
+outcomeConceptId occurs or all times the
+outcomeConceptId occurs for a person. Values
+accepted: 'all' or 'first'.}
+
+\item{detectOnDescendants}{For negative control outcomes, when set to TRUE,
+detectOnDescendants will use the vocabulary to
+find negative control outcomes using the
+outcomeConceptId and all descendants via the
+concept_ancestor table. When FALSE, only the exact
+outcomeConceptId will be used to detect the
+outcome.}
+
+\item{stopOnError}{If an error happens while generating one of the
+cohorts in the cohortDefinitionSet, should we
+stop processing the other cohorts? The default is
+TRUE; when set to FALSE, failures will be
+identified in the return value from this function.}
+
+\item{outputFolder}{Name of the folder where all the outputs will be written to.}
+
+\item{databaseId}{A unique ID for the database. This will be appended to
+most tables.}
+
+\item{incremental}{Create only cohorts that haven't been created before?}
+
+\item{incrementalFolder}{If \code{incremental = TRUE}, specify a folder where
+records are kept of which definition has been
+executed.}
+}
+\description{
+Run a cohort generation and export results
+}
+\details{
+Run a cohort generation for a set of cohorts and negative control outcomes.
+This function will also export the results of the run to the `outputFolder`.
+}
diff --git a/man/saveCohortSubsetDefinition.Rd b/man/saveCohortSubsetDefinition.Rd
index 855f25d..ce8d487 100644
--- a/man/saveCohortSubsetDefinition.Rd
+++ b/man/saveCohortSubsetDefinition.Rd
@@ -10,7 +10,7 @@ saveCohortSubsetDefinition(
)
}
\arguments{
-\item{subsetDefinition}{The subset definition object {@seealso CohortSubsetDefinition}}
+\item{subsetDefinition}{The subset definition object @seealso[CohortSubsetDefinition]}
\item{subsetJsonFolder}{Defines the folder to store the subset JSON}
}
diff --git a/man/uploadResults.Rd b/man/uploadResults.Rd
new file mode 100644
index 0000000..06700b2
--- /dev/null
+++ b/man/uploadResults.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ResultsDataModel.R
+\name{uploadResults}
+\alias{uploadResults}
+\title{Upload results to the database server.}
+\usage{
+uploadResults(
+ connectionDetails,
+ schema,
+ resultsFolder,
+ forceOverWriteOfSpecifications = FALSE,
+ purgeSiteDataBeforeUploading = TRUE,
+ tablePrefix = "",
+ ...
+)
+}
+\arguments{
+\item{connectionDetails}{An object of type \code{connectionDetails} as created using the
+\code{\link[DatabaseConnector]{createConnectionDetails}} function in the
+DatabaseConnector package.}
+
+\item{schema}{The schema on the server where the tables have been created.}
+
+\item{resultsFolder}{The folder holding the results in .csv files}
+
+\item{forceOverWriteOfSpecifications}{If TRUE, specifications of the phenotypes, cohort definitions, and analysis
+will be overwritten if they already exist on the database. Only use this if these specifications
+have changed since the last upload.}
+
+\item{purgeSiteDataBeforeUploading}{If TRUE, before inserting data for a specific databaseId all the data for
+that site will be dropped. This assumes the resultsFolder file contains the full data for that
+data site.}
+
+\item{tablePrefix}{(Optional) string to insert before table names for database table names}
+
+\item{...}{See ResultModelManager::uploadResults}
+}
+\description{
+Requires the results data model tables have been created using the \code{\link{createResultsDataModel}} function.
+}
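A closing sketch of the upload step (not from this diff), reusing the results database sketched under createResultsDataModel above and the csv export produced by runCohortGeneration; names are illustrative:

    uploadResults(
      connectionDetails = resultsConnectionDetails, # results DB created earlier
      schema = "main",
      resultsFolder = file.path(tempdir(), "export"), # the runCohortGeneration outputFolder
      purgeSiteDataBeforeUploading = TRUE
    )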
diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R
index 122d3b9..1cda5b4 100644
--- a/tests/testthat/helper.R
+++ b/tests/testthat/helper.R
@@ -84,49 +84,104 @@ getPlatformConnectionDetails <- function(dbmsPlatform) {
options("sqlRenderTempEmulationSchema" = NULL)
cohortTable <- "cohort"
} else {
- if (dbmsPlatform == "postgresql") {
- dbUser <- Sys.getenv("CDM5_POSTGRESQL_USER")
- dbPassword <- Sys.getenv("CDM5_POSTGRESQL_PASSWORD")
- dbServer <- Sys.getenv("CDM5_POSTGRESQL_SERVER")
- cdmDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_CDM_SCHEMA")
- vocabularyDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_CDM_SCHEMA")
- cohortDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_OHDSI_SCHEMA")
+ if (dbmsPlatform == "bigquery") {
+ # To avoid rate limit on BigQuery, only test on 1 OS:
+ if (.Platform$OS.type == "windows") {
+ bqKeyFile <- tempfile(fileext = ".json")
+ writeLines(Sys.getenv("CDM_BIG_QUERY_KEY_FILE"), bqKeyFile)
+ if (testthat::is_testing()) {
+ withr::defer(unlink(bqKeyFile, force = TRUE), testthat::teardown_env())
+ }
+ bqConnectionString <- gsub(
+ " |