diff --git a/.Rbuildignore b/.Rbuildignore index e91063e..2642258 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,3 +4,9 @@ ^\.Rproj\.user$ ^\.idea$ ^\.github$ +_pkgdown\.yml +compare_versions +deploy.sh +docs +extras +man-roxygen diff --git a/.github/workflows/R_CMD_check_Hades.yaml b/.github/workflows/R_CMD_check_Hades.yaml index 9a5fe40..96923df 100644 --- a/.github/workflows/R_CMD_check_Hades.yaml +++ b/.github/workflows/R_CMD_check_Hades.yaml @@ -20,9 +20,6 @@ jobs: fail-fast: false matrix: config: - - {os: windows-latest, r: '4.2.3', rtools: '42', rspm: "https://cloud.r-project.org"} - - {os: macOS-latest, r: '4.2.3', rtools: '42', rspm: "https://cloud.r-project.org"} - - {os: ubuntu-20.04, r: '4.2.3', rtools: '42', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - {os: windows-latest, r: 'release', rtools: '', rspm: "https://cloud.r-project.org"} - {os: macOS-latest, r: 'release', rtools: '', rspm: "https://cloud.r-project.org"} - {os: ubuntu-20.04, r: 'release', rtools: '', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} @@ -50,9 +47,23 @@ jobs: CDM5_REDSHIFT_PASSWORD: ${{ secrets.CDM5_REDSHIFT_PASSWORD }} CDM5_REDSHIFT_SERVER: ${{ secrets.CDM5_REDSHIFT_SERVER }} CDM5_REDSHIFT_USER: ${{ secrets.CDM5_REDSHIFT_USER }} + CDM_SNOWFLAKE_CDM53_SCHEMA: ${{ secrets.CDM_SNOWFLAKE_CDM53_SCHEMA }} + CDM_SNOWFLAKE_OHDSI_SCHEMA: ${{ secrets.CDM_SNOWFLAKE_OHDSI_SCHEMA }} + CDM_SNOWFLAKE_PASSWORD: ${{ secrets.CDM_SNOWFLAKE_PASSWORD }} + CDM_SNOWFLAKE_CONNECTION_STRING: ${{ secrets.CDM_SNOWFLAKE_CONNECTION_STRING }} + CDM_SNOWFLAKE_USER: ${{ secrets.CDM_SNOWFLAKE_USER }} + CDM5_SPARK_USER: ${{ secrets.CDM5_SPARK_USER }} + CDM5_SPARK_PASSWORD: ${{ secrets.CDM5_SPARK_PASSWORD }} + CDM5_SPARK_CONNECTION_STRING: ${{ secrets.CDM5_SPARK_CONNECTION_STRING }} + CDM5_SPARK_CDM_SCHEMA: ${{ secrets.CDM5_SPARK_CDM_SCHEMA }} + CDM5_SPARK_OHDSI_SCHEMA: ${{ secrets.CDM5_SPARK_OHDSI_SCHEMA }} + CDM_BIG_QUERY_CONNECTION_STRING: ${{ secrets.CDM_BIG_QUERY_CONNECTION_STRING }} + CDM_BIG_QUERY_KEY_FILE: ${{ secrets.CDM_BIG_QUERY_KEY_FILE }} + CDM_BIG_QUERY_CDM_SCHEMA: ${{ secrets.CDM_BIG_QUERY_CDM_SCHEMA }} + CDM_BIG_QUERY_OHDSI_SCHEMA: ${{ secrets.CDM_BIG_QUERY_OHDSI_SCHEMA }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: @@ -91,10 +102,13 @@ jobs: eval sudo $cmd done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') - - name: Reconfigure Java + - name: Setup Java if: runner.os == 'macOS' - run: R CMD javareconf - + uses: actions/setup-java@v4 + with: + distribution: 'corretto' + java-version: '8' + - name: Install libssh if: runner.os == 'Linux' run: | @@ -102,6 +116,7 @@ jobs: - name: Install dependencies run: | + install.packages("cachem") remotes::install_deps(dependencies = TRUE, INSTALL_opts=c("--no-multiarch")) remotes::install_cran("rcmdcheck") shell: Rscript {0} diff --git a/.github/workflows/R_CMD_check_main_weekly.yaml b/.github/workflows/R_CMD_check_main_weekly.yaml index ecf43ff..30c8dea 100644 --- a/.github/workflows/R_CMD_check_main_weekly.yaml +++ b/.github/workflows/R_CMD_check_main_weekly.yaml @@ -20,30 +20,41 @@ jobs: GITHUB_PAT: ${{ secrets.GH_TOKEN }} R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} - CDM5_ORACLE_CDM_SCHEMA: ${{ secrets.CDM5_ORACLE_CDM_SCHEMA }} + CDM5_ORACLE_CDM_SCHEMA: ${{ secrets.CDM5_ORACLE_CDM54_SCHEMA }} CDM5_ORACLE_OHDSI_SCHEMA: ${{ secrets.CDM5_ORACLE_OHDSI_SCHEMA }} CDM5_ORACLE_PASSWORD: ${{ 
secrets.CDM5_ORACLE_PASSWORD }} CDM5_ORACLE_SERVER: ${{ secrets.CDM5_ORACLE_SERVER }} CDM5_ORACLE_USER: ${{ secrets.CDM5_ORACLE_USER }} - CDM5_POSTGRESQL_CDM_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_CDM_SCHEMA }} + CDM5_POSTGRESQL_CDM_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_CDM54_SCHEMA }} CDM5_POSTGRESQL_OHDSI_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_OHDSI_SCHEMA }} CDM5_POSTGRESQL_PASSWORD: ${{ secrets.CDM5_POSTGRESQL_PASSWORD }} CDM5_POSTGRESQL_SERVER: ${{ secrets.CDM5_POSTGRESQL_SERVER }} CDM5_POSTGRESQL_USER: ${{ secrets.CDM5_POSTGRESQL_USER }} - CDM5_SQL_SERVER_CDM_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_CDM_SCHEMA }} + CDM5_SQL_SERVER_CDM_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_CDM54_SCHEMA }} CDM5_SQL_SERVER_OHDSI_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_OHDSI_SCHEMA }} CDM5_SQL_SERVER_PASSWORD: ${{ secrets.CDM5_SQL_SERVER_PASSWORD }} CDM5_SQL_SERVER_SERVER: ${{ secrets.CDM5_SQL_SERVER_SERVER }} CDM5_SQL_SERVER_USER: ${{ secrets.CDM5_SQL_SERVER_USER }} - CDM5_REDSHIFT_CDM_SCHEMA: ${{ secrets.CDM5_REDSHIFT_CDM_SCHEMA }} + CDM5_REDSHIFT_CDM_SCHEMA: ${{ secrets.CDM5_REDSHIFT_CDM54_SCHEMA }} CDM5_REDSHIFT_OHDSI_SCHEMA: ${{ secrets.CDM5_REDSHIFT_OHDSI_SCHEMA }} CDM5_REDSHIFT_PASSWORD: ${{ secrets.CDM5_REDSHIFT_PASSWORD }} CDM5_REDSHIFT_SERVER: ${{ secrets.CDM5_REDSHIFT_SERVER }} CDM5_REDSHIFT_USER: ${{ secrets.CDM5_REDSHIFT_USER }} + CDM_SNOWFLAKE_CDM53_SCHEMA: ${{ secrets.CDM_SNOWFLAKE_CDM53_SCHEMA }} + CDM_SNOWFLAKE_OHDSI_SCHEMA: ${{ secrets.CDM_SNOWFLAKE_OHDSI_SCHEMA }} + CDM_SNOWFLAKE_PASSWORD: ${{ secrets.CDM_SNOWFLAKE_PASSWORD }} + CDM_SNOWFLAKE_CONNECTION_STRING: ${{ secrets.CDM_SNOWFLAKE_CONNECTION_STRING }} + CDM_SNOWFLAKE_USER: ${{ secrets.CDM_SNOWFLAKE_USER }} CDM5_SPARK_USER: ${{ secrets.CDM5_SPARK_USER }} CDM5_SPARK_PASSWORD: ${{ secrets.CDM5_SPARK_PASSWORD }} CDM5_SPARK_CONNECTION_STRING: ${{ secrets.CDM5_SPARK_CONNECTION_STRING }} - + CDM5_SPARK_CDM_SCHEMA: ${{ secrets.CDM5_SPARK_CDM_SCHEMA }} + CDM5_SPARK_OHDSI_SCHEMA: ${{ secrets.CDM5_SPARK_OHDSI_SCHEMA }} + CDM_BIG_QUERY_CONNECTION_STRING: ${{ secrets.CDM_BIG_QUERY_CONNECTION_STRING }} + CDM_BIG_QUERY_KEY_FILE: ${{ secrets.CDM_BIG_QUERY_KEY_FILE }} + CDM_BIG_QUERY_CDM_SCHEMA: ${{ secrets.CDM_BIG_QUERY_CDM_SCHEMA }} + CDM_BIG_QUERY_OHDSI_SCHEMA: ${{ secrets.CDM_BIG_QUERY_OHDSI_SCHEMA }} + steps: - uses: actions/checkout@v2 diff --git a/DESCRIPTION b/DESCRIPTION index dd768aa..7882e2c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: CohortGenerator Type: Package Title: An R Package for Cohort Generation Against the OMOP CDM -Version: 0.9.0 -Date: 2024-05-28 +Version: 0.10.0 +Date: 2024-07-14 Authors@R: c( person("Anthony", "Sena", email = "sena@ohdsi.org", role = c("aut", "cre")), person("Jamie", "Gilbert", role = c("aut")), @@ -22,13 +22,16 @@ Imports: digest, dplyr, lubridate, + methods, ParallelLogger (>= 3.0.0), readr (>= 2.1.0), rlang, RJSONIO, jsonlite, + ResultModelManager, SqlRender (>= 1.11.1), - stringi (>= 1.7.6) + stringi (>= 1.7.6), + tibble Suggests: CirceR (>= 1.1.1), Eunomia, @@ -36,11 +39,11 @@ Suggests: rmarkdown, ROhdsiWebApi, testthat, - withr + withr, + zip Remotes: - ohdsi/CirceR, - ohdsi/Eunomia, - ohdsi/ROhdsiWebApi + ohdsi/ResultModelManager, + ohdsi/ROhdsiWebApi, License: Apache License VignetteBuilder: knitr URL: https://ohdsi.github.io/CohortGenerator/, https://github.com/OHDSI/CohortGenerator diff --git a/NAMESPACE b/NAMESPACE index becbe95..62124bb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(createDemographicSubset) export(createEmptyCohortDefinitionSet) 
export(createEmptyNegativeControlOutcomeCohortSet) export(createLimitSubset) +export(createResultsDataModel) export(createSubsetCohortWindow) export(dropCohortStatsTables) export(exportCohortStatsTables) @@ -23,9 +24,12 @@ export(generateCohortSet) export(generateNegativeControlOutcomeCohorts) export(getCohortCounts) export(getCohortDefinitionSet) +export(getCohortInclusionRules) export(getCohortStats) export(getCohortTableNames) +export(getDataMigrator) export(getRequiredTasks) +export(getResultsDataModelSpecifications) export(getSubsetDefinitions) export(insertInclusionRuleNames) export(isCamelCase) @@ -33,17 +37,21 @@ export(isCohortDefinitionSet) export(isFormattedForDatabaseUpload) export(isSnakeCase) export(isTaskRequired) +export(migrateDataModel) export(readCsv) export(recordTasksDone) +export(runCohortGeneration) export(sampleCohortDefinitionSet) export(saveCohortDefinitionSet) export(saveCohortSubsetDefinition) export(saveIncremental) +export(uploadResults) export(writeCsv) import(DatabaseConnector) import(R6) -importFrom(dplyr,"%>%") +import(dplyr) importFrom(grDevices,rgb) +importFrom(methods,is) importFrom(rlang,':=') importFrom(rlang,.data) importFrom(stats,aggregate) diff --git a/NEWS.md b/NEWS.md index d68e06c..d81818c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,20 @@ +CohortGenerator 0.10.0 +======================= +New Features +- Add `runCohortGeneration` function (Issue #165) +- Adopt ResultModelManager for handling results data models & uploading. Extend results data model to include information on cohort subsets (#154, #162) +- Remove REMOTES entries for CirceR and Eunomia which are now on CRAN (#145) +- Unit tests now running on all OHDSI DB Platforms (#151) + +Bug Fixes +- Negation of cohort subset operator must join on `subject_id` AND `start_date` (#167) +- Allow integer as cohort ID (#146) +- Use native messaging functions for output vs.
ParallelLogger (#97) +- Prevent upload of inclusion rule information (#78) +- Expose `colTypes` when working with .csv files (#59) +- Remove `bit64` from package (mostly) (#152) +- Updated documentation for cohort subset negate feature (#111) + CohortGenerator 0.9.0 ======================= - Random sample functionality (for development only) (Issue #129) diff --git a/R/CohortConstruction.R b/R/CohortConstruction.R index 2dfb86a..e27d222 100644 --- a/R/CohortConstruction.R +++ b/R/CohortConstruction.R @@ -76,6 +76,7 @@ generateCohortSet <- function(connectionDetails = NULL, "sql" ) ) + assertLargeInteger(cohortDefinitionSet$cohortId) # Verify that cohort IDs are not repeated in the cohort definition # set before generating if (length(unique(cohortDefinitionSet$cohortId)) != length(cohortDefinitionSet$cohortId)) { @@ -249,7 +250,7 @@ generateCohort <- function(cohortId = NULL, connection <- DatabaseConnector::connect(connectionDetails) on.exit(DatabaseConnector::disconnect(connection)) } - ParallelLogger::logInfo(i, "/", nrow(cohortDefinitionSet), "- Generating cohort: ", cohortName, " (id = ", cohortId, ")") + rlang::inform(paste0(i, "/", nrow(cohortDefinitionSet), "- Generating cohort: ", cohortName, " (id = ", cohortId, ")")) sql <- cohortDefinitionSet$sql[i] if (!isSubset) { diff --git a/R/CohortCount.R b/R/CohortCount.R index f14244f..7adad6c 100644 --- a/R/CohortCount.R +++ b/R/CohortCount.R @@ -67,7 +67,7 @@ getCohortCounts <- function(connectionDetails = NULL, if (tolower(cohortTable) %in% tablesInServer) { counts <- DatabaseConnector::querySql(connection, sql, snakeCaseToCamelCase = TRUE) delta <- Sys.time() - start - ParallelLogger::logInfo(paste("Counting cohorts took", signif(delta, 3), attr(delta, "units"))) + rlang::inform(paste("Counting cohorts took", signif(delta, 3), attr(delta, "units"))) if (!is.null(cohortDefinitionSet)) { # If the user has NOT specified a list of cohortIds # to use to filter the cohortDefinitionSet, then @@ -77,15 +77,17 @@ getCohortCounts <- function(connectionDetails = NULL, } counts <- merge( x = counts, - y = cohortDefinitionSet[cohortDefinitionSet$cohortId %in% cohortIds, ], + y = cohortDefinitionSet[cohortDefinitionSet$cohortId %in% cohortIds, , drop = FALSE], by = "cohortId", all.y = TRUE ) - counts <- transform( - counts, - cohortEntries = ifelse(is.na(cohortEntries), 0L, cohortEntries), - cohortSubjects = ifelse(is.na(cohortSubjects), 0L, cohortSubjects) - ) + counts <- with(counts, { + transform( + counts, + cohortEntries = ifelse(is.na(cohortEntries), 0L, cohortEntries), + cohortSubjects = ifelse(is.na(cohortSubjects), 0L, cohortSubjects) + ) + }) } if (!is.null(databaseId)) { counts$databaseId <- databaseId diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R index 7b09547..4404951 100644 --- a/R/CohortDefinitionSet.R +++ b/R/CohortDefinitionSet.R @@ -29,15 +29,41 @@ #' @export createEmptyCohortDefinitionSet <- function(verbose = FALSE) { checkmate::assert_logical(verbose) - cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification() + df <- data.frame( + cohortId = numeric(), + cohortName = character(), + sql = character(), + json = character() + ) if (verbose) { - print(cohortDefinitionSetSpec) + print(df) } - # Build the data.frame dynamically from the cohort definition set spec - df <- .createEmptyDataFrameFromSpecification(cohortDefinitionSetSpec) invisible(df) } +.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) { + checkmate::assert_data_frame(x) + df <- createEmptyCohortDefinitionSet(verbose = 
FALSE) + + # Compare the column names from the input x to an empty cohort + # definition set to ensure the required columns are present + cohortDefinitionSetColumns <- colnames(df) + matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns) + columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns) + + if (!columnNamesMatch && emitWarning) { + columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x)) + warningMessage <- paste0( + "The following columns were missing in your cohortDefinitionSet: ", + paste(columnsMissing, collapse = ","), + ". A cohortDefinitionSet requires the following columns: ", + paste(cohortDefinitionSetColumns, collapse = ",") + ) + warning(warningMessage) + } + invisible(columnNamesMatch) +} + #' Is the data.frame a cohort definition set? #' #' @description @@ -99,7 +125,6 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit checkmate::assert_data_frame(x) df <- createEmptyCohortDefinitionSet(verbose = FALSE) cohortDefinitionSetColumns <- colnames(df) - cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification() columnNamesMatch <- .cohortDefinitionSetHasRequiredColumns(x = x, emitWarning = emitWarning) if (!columnNamesMatch) { @@ -107,7 +132,8 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit } # Compare the data types from the input x to an empty cohort - # definition set to ensure the same data types are present + # definition set to ensure the same data types (or close enough) + # are present dataTypesMatch <- FALSE # Subset x to the required columns xSubset <- x[, cohortDefinitionSetColumns] @@ -116,7 +142,14 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit # Get the reference data types cohortDefinitionSetDataTypes <- sapply(df, typeof) # Check if the data types match - dataTypesMatch <- identical(x = xDataTypes, y = cohortDefinitionSetDataTypes) + # NOTE: createEmptyCohortDefinitionSet() is the reference for the data + # types. cohortId is declared as a numeric but an integer is also fine + dataTypesMatch <- (xDataTypes[1] %in% c("integer", "double") && all(xDataTypes[2:4] == "character")) + # Create the cohortDefinitionSetSpec from the names/data types for reference + cohortDefinitionSetSpec <- data.frame( + columnName = names(xDataTypes), + dataType = xDataTypes + ) if (!dataTypesMatch && emitWarning) { dataTypesMismatch <- setdiff(x = cohortDefinitionSetDataTypes, y = xDataTypes) # Create a column for the warning message @@ -145,50 +178,6 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit )) } -.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) { - checkmate::assert_data_frame(x) - df <- createEmptyCohortDefinitionSet(verbose = FALSE) - cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification() - - # Compare the column names from the input x to an empty cohort - # definition set to ensure the required columns are present - cohortDefinitionSetColumns <- colnames(df) - matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns) - columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns) - - if (!columnNamesMatch && emitWarning) { - columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x)) - warningMessage <- paste0( - "The following columns were missing in your cohortDefinitionSet: ", - paste(columnsMissing, collapse = ","), - ". 
A cohortDefinitionSet requires the following columns: ", - paste(cohortDefinitionSetColumns, collapse = ",") - ) - warning(warningMessage) - } - invisible(columnNamesMatch) -} - -#' Helper function to return the specification description of a -#' cohortDefinitionSet -#' -#' @description -#' This function reads from the cohortDefinitionSetSpecificationDescription.csv -#' to return a data.frame that describes the required columns in a -#' cohortDefinitionSet -#' -#' @return -#' Returns a data.frame that defines a cohortDefinitionSet -#' -#' @noRd -#' @keywords internal -.getCohortDefinitionSetSpecification <- function() { - return(readCsv(system.file("cohortDefinitionSetSpecificationDescription.csv", - package = "CohortGenerator", - mustWork = TRUE - ))) -} - #' Get a cohort definition set #' #' @description @@ -244,7 +233,7 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv", path <- system.file(fileName, package = packageName) } if (verbose) { - ParallelLogger::logInfo(paste0(" -- Loading ", basename(fileName), " from ", path)) + rlang::inform(paste0(" -- Loading ", basename(fileName), " from ", path)) } if (!file.exists(path)) { if (grepl(".json$", tolower(basename(fileName))) && warnOnMissingJson) { @@ -259,10 +248,10 @@ } # Read the settings file which holds the cohortDefinitionSet - ParallelLogger::logInfo("Loading cohortDefinitionSet") + rlang::inform("Loading cohortDefinitionSet") settings <- readCsv(file = getPath(fileName = settingsFileName), warnOnCaseMismatch = FALSE) - assert_settings_columns(names(settings), getPath(fileName = settingsFileName)) + assertSettingsColumns(names(settings), getPath(fileName = settingsFileName)) checkmate::assert_true(all(cohortFileNameValue %in% names(settings))) checkmate::assert_true((!all(.getFileDataColumns() %in% names(settings)))) @@ -313,12 +302,12 @@ # Loading cohort subset definitions with their associated targets if (loadSubsets & nrow(subsetsToLoad) > 0) { if (dir.exists(subsetJsonFolder)) { - ParallelLogger::logInfo("Loading Cohort Subset Definitions") + rlang::inform("Loading Cohort Subset Definitions") ## Loading subsets that apply to the saved definition sets for (i in unique(subsetsToLoad$subsetDefinitionId)) { subsetFile <- file.path(subsetJsonFolder, paste0(i, ".json")) - ParallelLogger::logInfo("Loading Cohort Subset Defintion ", subsetFile) + rlang::inform(paste0("Loading Cohort Subset Definition ", subsetFile)) subsetDef <- CohortSubsetDefinition$new(ParallelLogger::loadSettingsFromJson(subsetFile)) # Find target cohorts for this subset definition subsetTargetIds <- unique(subsetsToLoad[subsetsToLoad$subsetDefinitionId == i, ]$subsetParent) @@ -382,7 +371,7 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet, checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named") checkmate::assert_vector(cohortFileNameValue) checkmate::assert_true(length(cohortFileNameValue) > 0) - assert_settings_columns(names(cohortDefinitionSet)) + assertSettingsColumns(names(cohortDefinitionSet)) checkmate::assert_true(all(cohortFileNameValue %in% names(cohortDefinitionSet))) settingsFolder <- dirname(settingsFileName) if (!dir.exists(settingsFolder)) { @@ -397,7 +386,7 @@ # Export the cohortDefinitionSet to the settings folder if (verbose) { - ParallelLogger::logInfo("Exporting cohortDefinitionSet to ",
settingsFileName) + rlang::inform(paste0("Exporting cohortDefinitionSet to ", settingsFileName)) } # Write the settings file and ensure that the "sql" and "json" columns are # not included @@ -425,7 +414,7 @@ } if (verbose) { - ParallelLogger::logInfo("Exporting (", i, "/", nrow(cohortDefinitionSet), "): ", cohortName) + rlang::inform(paste0("Exporting (", i, "/", nrow(cohortDefinitionSet), "): ", cohortName)) } if (!is.na(json) && nchar(json) > 0) { @@ -441,7 +430,7 @@ } } - ParallelLogger::logInfo("Cohort definition saved") + rlang::inform("Cohort definition saved") } .getSettingsFileRequiredColumns <- function() { @@ -507,19 +496,37 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) { } } -.createEmptyDataFrameFromSpecification <- function(specifications) { - # Build the data.frame dynamically from the cohort definition set spec - df <- data.frame() - for (i in 1:nrow(specifications)) { - colName <- specifications$columnName[i] - dataType <- specifications$dataType[i] - if (dataType == "integer64") { - df <- df %>% dplyr::mutate(!!colName := do.call(what = bit64::as.integer64, args = list())) - } else { - df <- df %>% dplyr::mutate(!!colName := do.call(what = dataType, args = list())) - } +#' Custom checkmate assertion for ensuring a vector contains only integer numbers, +#' including large ones +#' +#' @description +#' This function is used to provide a more informative message informing +#' a user that their number must be an integer. Since the +#' cohort definition set allows for storing `numeric` data types, we need +#' to make sure that there is no fractional part in the cohort ID. +#' NOTE: This function is necessary since checkmate::assert_integerish +#' will still throw an error for a large integer, which is not desirable. +#' +#' @param x The vector containing integer/numeric values +#' +#' @param columnName The name of the column where this vector came from. This +#' is used when displaying the error message. +#' @return +#' Returns TRUE if all the values in x are integers +#' @noRd +#' @keywords internal +checkLargeInteger <- function(x, columnName = "cohortId") { + # NOTE: suppressWarnings used to mask + # warning from R which may happen for + # large values in x. + res <- all(suppressWarnings(x %% 1) == 0) + if (!isTRUE(res)) { + errorMessage <- paste0("The column ", columnName, " included non-integer values.
Please update and re-try") + return(errorMessage) + } else { + return(TRUE) } - invisible(df) } .copySubsetDefinitions <- function(copyToCds, copyFromCds) { diff --git a/R/CohortGenerator.R b/R/CohortGenerator.R index b82cdd1..d4588b4 100644 --- a/R/CohortGenerator.R +++ b/R/CohortGenerator.R @@ -19,12 +19,14 @@ #' @import DatabaseConnector #' @import R6 +#' @import dplyr #' @importFrom grDevices rgb +#' @importFrom methods is #' @importFrom stats aggregate setNames #' @importFrom utils write.csv install.packages menu packageVersion sessionInfo -#' @importFrom dplyr "%>%" #' @importFrom rlang .data ':=' NULL # Add custom assertions -assert_settings_columns <- checkmate::makeAssertionFunction(checkSettingsColumns) +assertSettingsColumns <- checkmate::makeAssertionFunction(checkSettingsColumns) +assertLargeInteger <- checkmate::makeAssertionFunction(checkLargeInteger) diff --git a/R/CohortSample.R b/R/CohortSample.R index 6cb393b..e522244 100644 --- a/R/CohortSample.R +++ b/R/CohortSample.R @@ -262,7 +262,7 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet, ) if (nrow(sampleTable) == 0) { - ParallelLogger::logInfo("No entires found for ", targetCohortId, " was it generated?") + rlang::inform(paste0("No entries found for ", targetCohortId, ", was it generated?")) return(sampledCohortDefinition) } # Called only for side effects diff --git a/R/CohortStats.R b/R/CohortStats.R index 9b1c2b5..2ddaa84 100644 --- a/R/CohortStats.R +++ b/R/CohortStats.R @@ -47,14 +47,6 @@ insertInclusionRuleNames <- function(connectionDetails = NULL, stop("You must provide either a database connection or the connection details.") } - checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named") - checkmate::assertNames(colnames(cohortDefinitionSet), - must.include = c( - "cohortId", - "cohortName", - "json" - ) - ) if (is.null(connection)) { connection <- DatabaseConnector::connect(connectionDetails) on.exit(DatabaseConnector::disconnect(connection)) } @@ -65,44 +57,7 @@ stop(paste0(cohortInclusionTable, " table not found in schema: ", cohortDatabaseSchema, ".
Please make sure the table is created using the createCohortTables() function before calling this function.")) } - # Assemble the cohort inclusion rules - # NOTE: This data frame must match the @cohort_inclusion_table - # structure as defined in inst/sql/sql_server/CreateCohortTables.sql - inclusionRules <- data.frame( - cohortDefinitionId = bit64::integer64(), - ruleSequence = integer(), - name = character(), - description = character() - ) - # Remove any cohort definitions that do not include the JSON property - cohortDefinitionSet <- cohortDefinitionSet[!(is.null(cohortDefinitionSet$json) | is.na(cohortDefinitionSet$json)), ] - for (i in 1:nrow(cohortDefinitionSet)) { - cohortDefinition <- RJSONIO::fromJSON(content = cohortDefinitionSet$json[i], digits = 23) - if (!is.null(cohortDefinition$InclusionRules)) { - nrOfRules <- length(cohortDefinition$InclusionRules) - if (nrOfRules > 0) { - for (j in 1:nrOfRules) { - ruleName <- cohortDefinition$InclusionRules[[j]]$name - ruleDescription <- cohortDefinition$InclusionRules[[j]]$description - if (is.na(ruleName) || ruleName == "") { - ruleName <- paste0("Unamed rule (Sequence ", j - 1, ")") - } - if (is.null(ruleDescription)) { - ruleDescription <- "" - } - inclusionRules <- rbind( - inclusionRules, - data.frame( - cohortDefinitionId = bit64::as.integer64(cohortDefinitionSet$cohortId[i]), - ruleSequence = as.integer(j - 1), - name = ruleName, - description = ruleDescription - ) - ) - } - } - } - } + inclusionRules <- getCohortInclusionRules(cohortDefinitionSet) # Remove any existing data to prevent duplication DatabaseConnector::renderTranslateExecuteSql( @@ -116,7 +71,7 @@ # Insert the inclusion rules if (nrow(inclusionRules) > 0) { - ParallelLogger::logInfo("Inserting inclusion rule names") + rlang::inform("Inserting inclusion rule names") DatabaseConnector::insertTable( connection = connection, databaseSchema = cohortDatabaseSchema, @@ -152,7 +107,7 @@ getStatsTable <- function(connectionDetails, databaseId <- NULL } - ParallelLogger::logInfo("- Fetching data from ", table) + rlang::inform(paste0("- Fetching data from ", table)) sql <- "SELECT {@database_id != ''}?{CAST('@database_id' as VARCHAR(255)) as database_id,} t.* FROM @cohort_database_schema.@table t" data <- DatabaseConnector::renderTranslateQuerySql( sql = sql, @@ -174,6 +129,7 @@ } #' Get Cohort Inclusion Stats Table Data +#' #' @description #' This function returns a data frame of the data in the Cohort Inclusion Tables. #' Results are organized into a list with 5 different data frames: @@ -244,3 +200,67 @@ getCohortStats <- function(connectionDetails, } return(results) } + + +#' Get Cohort Inclusion Rules from a cohort definition set +#' +#' @description
#' This function returns a data frame of the inclusion rules defined
#' in a cohort definition set.
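# A minimal usage sketch for the new getCohortInclusionRules() export
# (illustrative only; the settings file and folder paths below are
# assumptions, not part of this diff):
library(CohortGenerator)
cohortDefinitionSet <- getCohortDefinitionSet(
  settingsFileName = "settings/Cohorts.csv",
  jsonFolder = "cohorts",
  sqlFolder = "sql/sql_server"
)
# Pull the inclusion rule names straight from the cohort JSON, without
# needing insertInclusionRuleNames() to stage them in the database first
inclusionRules <- getCohortInclusionRules(cohortDefinitionSet)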
+#' +#' @md +#' @template CohortDefinitionSet +#' +#' @export +getCohortInclusionRules <- function(cohortDefinitionSet) { + checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named") + checkmate::assertNames(colnames(cohortDefinitionSet), + must.include = c( + "cohortId", + "cohortName", + "json" + ) + ) + + # Assemble the cohort inclusion rules + # NOTE: This data frame must match the @cohort_inclusion_table + # structure as defined in inst/sql/sql_server/CreateCohortTables.sql + inclusionRules <- data.frame( + cohortDefinitionId = bit64::integer64(), + ruleSequence = integer(), + name = character(), + description = character() + ) + + # Remove any cohort definitions that do not include the JSON property + cohortDefinitionSet <- cohortDefinitionSet[!(is.null(cohortDefinitionSet$json) | is.na(cohortDefinitionSet$json)), ] + for (i in 1:nrow(cohortDefinitionSet)) { + cohortDefinition <- RJSONIO::fromJSON(content = cohortDefinitionSet$json[i], digits = 23) + if (!is.null(cohortDefinition$InclusionRules)) { + nrOfRules <- length(cohortDefinition$InclusionRules) + if (nrOfRules > 0) { + for (j in 1:nrOfRules) { + ruleName <- cohortDefinition$InclusionRules[[j]]$name + ruleDescription <- cohortDefinition$InclusionRules[[j]]$description + if (is.na(ruleName) || ruleName == "") { + ruleName <- paste0("Unamed rule (Sequence ", j - 1, ")") + } + if (is.null(ruleDescription)) { + ruleDescription <- "" + } + inclusionRules <- rbind( + inclusionRules, + data.frame( + cohortDefinitionId = bit64::as.integer64(cohortDefinitionSet$cohortId[i]), + ruleSequence = as.integer(j - 1), + name = ruleName, + description = ruleDescription + ) + ) + } + } + } + } + + invisible(inclusionRules) +} diff --git a/R/CohortTables.R b/R/CohortTables.R index bf0e2b9..19a57c7 100644 --- a/R/CohortTables.R +++ b/R/CohortTables.R @@ -98,13 +98,13 @@ createCohortTables <- function(connectionDetails = NULL, for (i in 1:length(cohortTableNames)) { if (toupper(cohortTableNames[i]) %in% toupper(tables)) { createTableFlagList[i] <- FALSE - ParallelLogger::logInfo("Table \"", cohortTableNames[i], "\" already exists and in incremental mode, so not recreating it.") + rlang::inform(paste0("Table \"", cohortTableNames[i], "\" already exists and in incremental mode, so not recreating it.")) } } } if (any(unlist(createTableFlagList, use.names = FALSE))) { - ParallelLogger::logInfo("Creating cohort tables") + rlang::inform("Creating cohort tables") createSampleTable <- ifelse( test = is.null(createTableFlagList$cohortSampleTable), yes = FALSE, @@ -137,7 +137,7 @@ createCohortTables <- function(connectionDetails = NULL, DatabaseConnector::executeSql(connection, sql, progressBar = FALSE, reportOverallTime = FALSE) logCreateTableMessage <- function(schema, tableName) { - ParallelLogger::logInfo("- Created table ", schema, ".", tableName) + rlang::inform(paste0("- Created table ", schema, ".", tableName)) } for (i in 1:length(createTableFlagList)) { if (createTableFlagList[[i]]) { @@ -146,7 +146,7 @@ createCohortTables <- function(connectionDetails = NULL, } delta <- Sys.time() - start - ParallelLogger::logInfo("Creating cohort tables took ", round(delta, 2), attr(delta, "units")) + rlang::inform(paste0("Creating cohort tables took ", round(delta, 2), attr(delta, "units"))) } } @@ -173,7 +173,7 @@ dropCohortStatsTables <- function(connectionDetails = NULL, # Export the stats dropTable <- function(table) { - ParallelLogger::logInfo("- Dropping ", table) + rlang::inform(paste0("- Dropping ", table)) sql <- "TRUNCATE TABLE 
@cohort_database_schema.@table; DROP TABLE @cohort_database_schema.@table;" DatabaseConnector::renderTranslateExecuteSql( diff --git a/R/CsvHelper.R b/R/CsvHelper.R index 25d3d10..a1a55b1 100644 --- a/R/CsvHelper.R +++ b/R/CsvHelper.R @@ -28,12 +28,42 @@ #' @param warnOnCaseMismatch When TRUE, raise a warning if column headings #' in the .csv are not in snake_case format #' +#' @param colTypes Corresponds to the `col_types` argument in the `readr::read_csv` function. +#' One of `NULL`, a [readr::cols()] specification, or +#' a string. See `vignette("readr")` for more details. +#' +#' If `NULL`, all column types will be inferred from `guess_max` rows of the +#' input, interspersed throughout the file. This is convenient (and fast), +#' but not robust. If the guessed types are wrong, you'll need to increase +#' `guess_max` or supply the correct types yourself. +#' +#' Column specifications created by [list()] or [cols()] must contain +#' one column specification for each column. +#' +#' Alternatively, you can use a compact string representation where each +#' character represents one column: +#' - c = character +#' - i = integer +#' - n = number +#' - d = double +#' - l = logical +#' - f = factor +#' - D = date +#' - T = date time +#' - t = time +#' - ? = guess +#' - _ or - = skip +#' +#' By default, reading a file without a column specification will print a +#' message showing what `readr` guessed they were. To remove this message, +#' set `show_col_types = FALSE` or set `options(readr.show_col_types = FALSE)`. +#' #' @return #' A tibble with the .csv contents #' #' @export -readCsv <- function(file, warnOnCaseMismatch = TRUE) { - fileContents <- .readCsv(file = file) +readCsv <- function(file, warnOnCaseMismatch = TRUE, colTypes = readr::cols()) { + fileContents <- .readCsv(file = file, colTypes = colTypes) columnNames <- colnames(fileContents) columnNamesInSnakeCaseFormat <- isSnakeCase(columnNames) if (!all(columnNamesInSnakeCaseFormat) && warnOnCaseMismatch) { @@ -58,10 +88,10 @@ #' #' @noRd #' @keywords internal -.readCsv <- function(file) { +.readCsv <- function(file, colTypes = readr::cols()) { invisible(readr::read_csv( file = file, - col_types = readr::cols(), + col_types = colTypes, lazy = FALSE )) } diff --git a/R/Export.R b/R/Export.R index f351821..bab20f6 100644 --- a/R/Export.R +++ b/R/Export.R @@ -19,7 +19,14 @@ #' @description #' This function retrieves the data from the cohort statistics tables and #' writes them to the inclusion statistics folder specified in the function -#' call. +#' call. NOTE: inclusion rule names are handled in one of two ways: +#' +#' 1. You can specify the cohortDefinitionSet parameter and the inclusion rule +#' names will be extracted from the data.frame. +#' 2. You can insert the inclusion rule names into the database using the +#' insertInclusionRuleNames function of this package. +#' +#' The first approach is preferred, as it avoids the warning emitted otherwise. #' #' @template Connection #' #' @@ -39,6 +46,11 @@ #' @param databaseId Optional - when specified, the databaseId will be added #' to the exported results #' +#' @template CohortDefinitionSet +#' +#' @param tablePrefix Optional - allows appending a prefix to the exported
#' file names.
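# A quick sketch of the colTypes pass-through documented above (hedged:
# "my_cohorts.csv" and the compact "dccc" spec are illustrative, not from
# this diff). Pinning the ID column to double avoids readr guessing an
# integer type for very large cohort IDs:
cohorts <- CohortGenerator::readCsv(
  file = "my_cohorts.csv",
  colTypes = "dccc" # cohort_id = double; remaining columns = character
)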
+#' #' @export exportCohortStatsTables <- function(connectionDetails, connection = NULL, @@ -48,7 +60,9 @@ exportCohortStatsTables <- function(connectionDetails, snakeCaseToCamelCase = TRUE, fileNamesInSnakeCase = FALSE, incremental = FALSE, - databaseId = NULL) { + databaseId = NULL, + cohortDefinitionSet = NULL, + tablePrefix = "") { if (is.null(connection)) { # Establish the connection and ensure the cleanup is performed connection <- DatabaseConnector::connect(connectionDetails) @@ -59,21 +73,12 @@ exportCohortStatsTables <- function(connectionDetails, dir.create(cohortStatisticsFolder, recursive = TRUE) } - # Export the stats - exportStats <- function(table, + # Internal function to export the stats + exportStats <- function(data, fileName, - includeDatabaseId) { - data <- getStatsTable( - connection = connection, - table = table, - snakeCaseToCamelCase = snakeCaseToCamelCase, - databaseId = databaseId, - cohortDatabaseSchema = cohortDatabaseSchema, - includeDatabaseId = includeDatabaseId - ) - - fullFileName <- file.path(cohortStatisticsFolder, fileName) - ParallelLogger::logInfo("- Saving data to - ", fullFileName) + tablePrefix) { + fullFileName <- file.path(cohortStatisticsFolder, paste0(tablePrefix, fileName)) + rlang::inform(paste0("- Saving data to - ", fullFileName)) if (incremental) { if (snakeCaseToCamelCase) { cohortDefinitionIds <- unique(data$cohortDefinitionId) @@ -88,39 +93,113 @@ exportCohortStatsTables <- function(connectionDetails, } tablesToExport <- data.frame( - tableName = cohortTableNames$cohortInclusionTable, - fileName = "cohort_inclusion.csv", - includeDatabaseId = FALSE + tableName = c("cohortInclusionResultTable", "cohortInclusionStatsTable", "cohortSummaryStatsTable", "cohortCensorStatsTable"), + fileName = c("cohort_inc_result.csv", "cohort_inc_stats.csv", "cohort_summary_stats.csv", "cohort_censor_stats.csv") + ) + + if (is.null(cohortDefinitionSet)) { + warning("No cohortDefinitionSet specified; please make sure you've inserted the inclusion rule names using the insertInclusionRuleNames function.") + tablesToExport <- rbind(tablesToExport, data.frame( + tableName = "cohortInclusionTable", + fileName = paste0(tablePrefix, "cohort_inclusion.csv") + )) + } else { + inclusionRules <- getCohortInclusionRules(cohortDefinitionSet) + names(inclusionRules) <- SqlRender::camelCaseToSnakeCase(names(inclusionRules)) + exportStats( + data = inclusionRules, + fileName = "cohort_inclusion.csv", + tablePrefix = tablePrefix + ) + } + + # Get the cohort statistics + cohortStats <- getCohortStats( + connectionDetails = connectionDetails, + connection = connection, + cohortDatabaseSchema = cohortDatabaseSchema, + databaseId = databaseId, + snakeCaseToCamelCase = snakeCaseToCamelCase, + cohortTableNames = cohortTableNames ) - tablesToExport <- rbind(tablesToExport, data.frame( - tableName = cohortTableNames$cohortInclusionResultTable, - fileName = "cohort_inc_result.csv", - includeDatabaseId = TRUE - )) - tablesToExport <- rbind(tablesToExport, data.frame( - tableName = cohortTableNames$cohortInclusionStatsTable, - fileName = "cohort_inc_stats.csv", - includeDatabaseId = TRUE - )) - tablesToExport <- rbind(tablesToExport, data.frame( - tableName = cohortTableNames$cohortSummaryStatsTable, - fileName = "cohort_summary_stats.csv", - includeDatabaseId = TRUE - )) - tablesToExport <- rbind(tablesToExport, data.frame( - tableName = cohortTableNames$cohortCensorStatsTable, - fileName = "cohort_censor_stats.csv", - includeDatabaseId = TRUE - )) + for (i in 1:nrow(tablesToExport)) 
{ fileName <- ifelse(test = fileNamesInSnakeCase, yes = tablesToExport$fileName[i], no = SqlRender::snakeCaseToCamelCase(tablesToExport$fileName[i]) ) exportStats( - table = tablesToExport$tableName[i], + data = cohortStats[[tablesToExport$tableName[i]]], fileName = fileName, - includeDatabaseId = tablesToExport$includeDatabaseId[i] + tablePrefix = tablePrefix ) } } + +exportCohortDefinitionSet <- function(outputFolder, cohortDefinitionSet = NULL) { + cohortDefinitions <- createEmptyResult("cg_cohort_definition") + cohortSubsets <- createEmptyResult("cg_cohort_subset_definition") + if (!is.null(cohortDefinitionSet)) { + cdsCohortSubsets <- getSubsetDefinitions(cohortDefinitionSet) + if (length(cdsCohortSubsets) > 0) { + for (i in seq_along(cdsCohortSubsets)) { + cohortSubsets <- rbind( + cohortSubsets, + data.frame( + subsetDefinitionId = cdsCohortSubsets[[i]]$definitionId, + json = as.character(cdsCohortSubsets[[i]]$toJSON()) + ) + ) + } + } else { + # NOTE: In this case the cohortDefinitionSet has no subsets defined + # and so we need to add the additional columns that are defined + # in the function: addCohortSubsetDefinition. To do this, + # we'll construct a copy of the cohortDefinitionSet with a single + # subset to get the proper structure and filter it to the + # cohorts of interest. + cdsCopy <- cohortDefinitionSet %>% + addCohortSubsetDefinition( + cohortSubsetDefintion = createCohortSubsetDefinition( + definitionId = 1, + name = "empty", + subsetOperators = list( + createDemographicSubset() + ) + ) + ) %>% + dplyr::filter(.data$cohortId == cohortDefinitionSet$cohortId) + cohortDefinitionSet <- cdsCopy + } + # Massage and save the cohort definition set + colsToRename <- c("cohortId", "cohortName", "sql", "json") + colInd <- which(names(cohortDefinitionSet) %in% colsToRename) + names(cohortDefinitionSet)[colInd] <- c("cohortDefinitionId", "cohortName", "sqlCommand", "json") + if (!"description" %in% names(cohortDefinitionSet)) { + cohortDefinitionSet$description <- "" + } + cohortDefinitions <- cohortDefinitionSet[, intersect(names(cohortDefinitions), names(cohortDefinitionSet))] + } + writeCsv( + x = cohortDefinitions, + file = file.path(outputFolder, "cg_cohort_definition.csv") + ) + writeCsv( + x = cohortSubsets, + file = file.path(outputFolder, "cg_cohort_subset_definition.csv") + ) +} + +createEmptyResult <- function(tableName) { + columns <- readCsv( + file = system.file("csv", "resultsDataModelSpecification.csv", package = "CohortGenerator") + ) %>% + dplyr::filter(.data$tableName == !!tableName) %>% + dplyr::pull(.data$columnName) %>% + SqlRender::snakeCaseToCamelCase() + result <- vector(length = length(columns)) + names(result) <- columns + result <- tibble::as_tibble(t(result), name_repair = "check_unique") + result <- result[FALSE, ] + return(result) +} diff --git a/R/Incremental.R b/R/Incremental.R index de400e4..cb578b0 100644 --- a/R/Incremental.R +++ b/R/Incremental.R @@ -70,7 +70,7 @@ isTaskRequired <- function(..., checksum, recordKeepingFile, verbose = TRUE) { if (verbose) { key <- list(...) 
key <- paste(sprintf("%s = '%s'", names(key), key), collapse = ", ") - ParallelLogger::logInfo("Skipping ", key, " because it is unchanged from earlier run") + rlang::inform(paste0("Skipping ", key, " because it is unchanged from earlier run")) } return(FALSE) } else { @@ -115,7 +115,7 @@ getRequiredTasks <- function(..., checksum, recordKeepingFile) { tasks$checksum <- NULL if (length(idx) > 0) { text <- paste(sprintf("%s = %s", names(tasks), tasks[idx, ]), collapse = ", ") - ParallelLogger::logInfo("Skipping ", text, " because it is unchanged from earlier run") + rlang::inform(paste0("Skipping ", text, " because it is unchanged from earlier run")) tasks <- tasks[-idx, ] } } diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R index 516662c..a031078 100644 --- a/R/NegativeControlCohorts.R +++ b/R/NegativeControlCohorts.R @@ -29,36 +29,17 @@ #' @export createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) { checkmate::assert_logical(verbose) - negativeControlOutcomeCohortSetSpecification <- .getNegativeControlOutcomeCohortSetSpecification() + df <- data.frame( + cohortId = numeric(), + cohortName = character(), + outcomeConceptId = numeric() + ) if (verbose) { - print(negativeControlOutcomeCohortSetSpecification) + print(df) } - # Build the data.frame dynamically - df <- .createEmptyDataFrameFromSpecification(negativeControlOutcomeCohortSetSpecification) invisible(df) } -#' Helper function to return the specification description of a -#' negativeControlOutcomeCohortSet -#' -#' @description -#' This function reads from the negativeControlOutcomeCohortSetSpecificationDescription.csv -#' to return a data.frame that describes the required columns in a -#' negativeControlOutcomeCohortSet -#' -#' @return -#' Returns a data.frame that defines a negativeControlOutcomeCohortSet -#' -#' @noRd -#' @keywords internal -.getNegativeControlOutcomeCohortSetSpecification <- function() { - return(readCsv(system.file("negativeControlOutcomeCohortSetSpecificationDescription.csv", - package = "CohortGenerator", - mustWork = TRUE - ))) -} - - #' Generate a set of negative control outcome cohorts #' #' @description @@ -111,12 +92,14 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL, checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first")) checkmate::assert_logical(detectOnDescendants) checkmate::assertNames(colnames(negativeControlOutcomeCohortSet), - must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName + must.include = names(createEmptyNegativeControlOutcomeCohortSet()) ) checkmate::assert_data_frame( x = negativeControlOutcomeCohortSet, min.rows = 1 ) + assertLargeInteger(negativeControlOutcomeCohortSet$cohortId) + assertLargeInteger(negativeControlOutcomeCohortSet$outcomeConceptId, columnName = "outcomeConceptId") # Verify that cohort IDs are not repeated in the negative control # cohort definition set before generating @@ -163,21 +146,7 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL, stop(paste0("Table: ", cohortTable, " not found in schema: ", cohortDatabaseSchema, ". Please use `createCohortTable` to ensure the cohort table is created before generating cohorts.")) } - ParallelLogger::logInfo("Generating negative control outcome cohorts") - - # Send the negative control outcome cohort set to the server for use - # in processing. 
This temp table will hold - # cohort_definition_id and the outcomeConceptId in the data.frame() - DatabaseConnector::insertTable( - connection = connection, - data = negativeControlOutcomeCohortSet, - tempEmulationSchema = tempEmulationSchema, - tableName = "#nc_set", - camelCaseToSnakeCase = TRUE, - dropTableIfExists = TRUE, - createTable = TRUE, - tempTable = TRUE - ) + rlang::inform("Generating negative control outcome cohorts") sql <- createNegativeControlOutcomesQuery( connection = connection, @@ -186,7 +155,8 @@ cohortDatabaseSchema = cohortDatabaseSchema, cohortTable = cohortTable, occurrenceType = occurrenceType, - detectOnDescendants = detectOnDescendants + detectOnDescendants = detectOnDescendants, + negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet ) DatabaseConnector::executeSql( @@ -213,7 +183,31 @@ createNegativeControlOutcomesQuery <- function(connection, cohortDatabaseSchema, cohortTable, occurrenceType, - detectOnDescendants) { + detectOnDescendants, + negativeControlOutcomeCohortSet) { + selectClause <- "" + for (i in 1:nrow(negativeControlOutcomeCohortSet)) { + selectClause <- paste0( + selectClause, + "SELECT CAST(", negativeControlOutcomeCohortSet$cohortId[i], " AS BIGINT), ", + "CAST(", negativeControlOutcomeCohortSet$outcomeConceptId[i], " AS BIGINT)" + ) + if (i < nrow(negativeControlOutcomeCohortSet)) { + selectClause <- paste0(selectClause, "\nUNION\n") + } + } + selectClause + ncSetQuery <- paste0( + "CREATE TABLE #nc_set (", + " cohort_id bigint NOT NULL,", + " outcome_concept_id bigint NOT NULL", + ")", + ";", + "INSERT INTO #nc_set (cohort_id, outcome_concept_id)\n", + selectClause, + "\n;" + ) + sql <- SqlRender::readSql(system.file("sql/sql_server/NegativeControlOutcomes.sql", package = "CohortGenerator", mustWork = TRUE)) sql <- SqlRender::render( sql = sql, @@ -222,6 +216,7 @@ cohort_table = cohortTable, detect_on_descendants = detectOnDescendants, occurrence_type = occurrenceType, + nc_set_query = ncSetQuery, warnOnMissingParameters = TRUE ) sql <- SqlRender::translate( diff --git a/R/ResultsDataModel.R b/R/ResultsDataModel.R new file mode 100644 index 0000000..07643f0 --- /dev/null +++ b/R/ResultsDataModel.R @@ -0,0 +1,147 @@ +# Copyright 2024 Observational Health Data Sciences and Informatics +# +# This file is part of CohortGenerator +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#' Get specifications for CohortGenerator results data model +#' +#' @return +#' A tibble data frame object with specifications +#' +#' @export +getResultsDataModelSpecifications <- function() { + resultsDataModelSpecifications <- readCsv( + file = system.file("csv", "resultsDataModelSpecification.csv", package = "CohortGenerator") + ) + return(resultsDataModelSpecifications) +} + +#' Create the results data model tables on a database server.
+#' +#' @details +#' Only PostgreSQL and SQLite servers are supported. +#' +#' @param connectionDetails DatabaseConnector connectionDetails instance; see [DatabaseConnector::createConnectionDetails()] +#' @param databaseSchema The schema on the server where the tables will be created. +#' @param tablePrefix (Optional) string to insert before table names for database table names +#' @export +createResultsDataModel <- function(connectionDetails = NULL, + databaseSchema, + tablePrefix = "") { + if (connectionDetails$dbms == "sqlite" & databaseSchema != "main") { + stop("Invalid schema for sqlite, use databaseSchema = 'main'") + } + + connection <- DatabaseConnector::connect(connectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + + # Create first version of results model: + sql <- SqlRender::readSql(system.file("sql/sql_server/CreateResultsDataModel.sql", package = "CohortGenerator", mustWork = TRUE)) + sql <- SqlRender::render( + sql = sql, + database_schema = databaseSchema, + table_prefix = tablePrefix + ) + sql <- SqlRender::translate(sql = sql, targetDialect = connection@dbms) + DatabaseConnector::executeSql(connection, sql) + # Migrate to current version: + migrateDataModel( + connectionDetails = connectionDetails, + databaseSchema = databaseSchema, + tablePrefix = tablePrefix + ) +} + +#' Upload results to the database server. +#' +#' @description +#' Requires that the results data model tables have been created using the \code{\link{createResultsDataModel}} function. +#' +#' @param connectionDetails An object of type \code{connectionDetails} as created using the +#' \code{\link[DatabaseConnector]{createConnectionDetails}} function in the +#' DatabaseConnector package. +#' @param schema The schema on the server where the tables have been created. +#' @param resultsFolder The folder holding the results in .csv files +#' @param forceOverWriteOfSpecifications If TRUE, specifications of the phenotypes, cohort definitions, and analysis +#' will be overwritten if they already exist on the database. Only use this if these specifications +#' have changed since the last upload. +#' @param purgeSiteDataBeforeUploading If TRUE, before inserting data for a specific databaseId all the data for +#' that site will be dropped. This assumes the resultsFolder contains the full data for that +#' site. +#' @param tablePrefix (Optional) string to insert before table names for database table names +#' @param ... See ResultModelManager::uploadResults +#' @export +uploadResults <- function(connectionDetails, + schema, + resultsFolder, + forceOverWriteOfSpecifications = FALSE, + purgeSiteDataBeforeUploading = TRUE, + tablePrefix = "", + ...) { + ResultModelManager::uploadResults( + connectionDetails = connectionDetails, + schema = schema, + resultsFolder = resultsFolder, + tablePrefix = tablePrefix, + forceOverWriteOfSpecifications = forceOverWriteOfSpecifications, + purgeSiteDataBeforeUploading = purgeSiteDataBeforeUploading, + runCheckAndFixCommands = FALSE, + specifications = getResultsDataModelSpecifications(), + warnOnMissingTable = FALSE, + ... + ) +} + +#' Migrate Data Model +#' @description +#' Migrate data from current state to next state +#' +#' It is strongly advised that you have a backup of all data (either sqlite files, a backup database in the case you
#' are using a postgres backend, or the csv/zip files kept from your data generation).
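# Putting the new results-model functions together, a sketch under stated
# assumptions: a local SQLite results file and a folder of .csv results
# produced by a prior export; neither path comes from this diff:
connectionDetails <- DatabaseConnector::createConnectionDetails(
  dbms = "sqlite",
  server = "results.sqlite"
)
# Creates the results tables, then migrates them to the current version
createResultsDataModel(
  connectionDetails = connectionDetails,
  databaseSchema = "main" # sqlite requires 'main', per the check above
)
# Loads the exported .csv files against getResultsDataModelSpecifications()
uploadResults(
  connectionDetails = connectionDetails,
  schema = "main",
  resultsFolder = "output"
)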
+#' +#' @inheritParams getDataMigrator +#' @export +migrateDataModel <- function(connectionDetails, databaseSchema, tablePrefix = "") { + ParallelLogger::logInfo("Migrating data set") + migrator <- getDataMigrator( + connectionDetails = connectionDetails, + databaseSchema = databaseSchema, + tablePrefix = tablePrefix + ) + migrator$executeMigrations() + migrator$finalize() +} + +#' Get database migrations instance +#' @description +#' +#' Returns a ResultModelManager DataMigrationManager instance. +#' @seealso [ResultModelManager::DataMigrationManager] which this function is a utility for. +#' +#' @param connectionDetails DatabaseConnector connection details object +#' @param databaseSchema Schema on the server where the results tables live +#' @param tablePrefix (Optional) Use if a table prefix is used before table names (e.g. "cg_") +#' @returns Instance of ResultModelManager::DataMigrationManager that has an interface for converting existing data models +#' @export +getDataMigrator <- function(connectionDetails, databaseSchema, tablePrefix = "") { + ResultModelManager::DataMigrationManager$new( + connectionDetails = connectionDetails, + databaseSchema = databaseSchema, + tablePrefix = tablePrefix, + packageTablePrefix = "cg_", + migrationPath = "migrations", + packageName = "CohortGenerator" + ) +} diff --git a/R/RunCohortGeneration.R b/R/RunCohortGeneration.R new file mode 100644 index 0000000..e1565d0 --- /dev/null +++ b/R/RunCohortGeneration.R @@ -0,0 +1,306 @@ +# Copyright 2024 Observational Health Data Sciences and Informatics +# +# This file is part of CohortGenerator +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Run a cohort generation and export results +#' +#' @details +#' Run a cohort generation for a set of cohorts and negative control outcomes. +#' This function will also export the results of the run to the `outputFolder`. +#' +#' @param connectionDetails An object of type \code{connectionDetails} as created using the +#' \code{\link[DatabaseConnector]{createConnectionDetails}} function in the +#' DatabaseConnector package. +#' +#' @template CdmDatabaseSchema +#' +#' @template TempEmulationSchema +#' +#' @template CohortTableNames +#' +#' @template CohortDefinitionSet +#' +#' @template NegativeControlOutcomeCohortSet +#' +#' @param occurrenceType For negative control outcomes, the occurrenceType +#' will detect either the first time an +#' outcomeConceptId occurs or all times the +#' outcomeConceptId occurs for a person. Values +#' accepted: 'all' or 'first'. +#' +#' @param detectOnDescendants For negative control outcomes, when set to TRUE, +#' detectOnDescendants will use the vocabulary to +#' find negative control outcomes using the +#' outcomeConceptId and all descendants via the +#' concept_ancestor table. When FALSE, only the exact +#' outcomeConceptId will be used to detect the +#' outcome. +#' +#' @param stopOnError If an error happens while generating one of the
#' cohorts in the cohortDefinitionSet, should we
#' stop processing the other cohorts?
The default is +#' TRUE; when set to FALSE, failures will be +#' identified in the return value from this function. +#' +#' @param outputFolder Name of the folder where all the outputs will be written to. +#' +#' @param databaseId A unique ID for the database. This will be appended to +#' most tables. +#' +#' @param incremental Create only cohorts that haven't been created before? +#' +#' @param incrementalFolder If \code{incremental = TRUE}, specify a folder where +#' records are kept of which definition has been +#' executed. +#' +#' @export +runCohortGeneration <- function(connectionDetails, + cdmDatabaseSchema, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), + cohortDatabaseSchema = cdmDatabaseSchema, + cohortTableNames = getCohortTableNames(), + cohortDefinitionSet = NULL, + negativeControlOutcomeCohortSet = NULL, + occurrenceType = "all", + detectOnDescendants = FALSE, + stopOnError = TRUE, + outputFolder, + databaseId = 1, + incremental = FALSE, + incrementalFolder = NULL) { + if (is.null(cohortDefinitionSet) && is.null(negativeControlOutcomeCohortSet)) { + stop("You must supply at least 1 cohortDefinitionSet OR 1 negativeControlOutcomeCohortSet") + } + errorMessages <- checkmate::makeAssertCollection() + if (is(connectionDetails, "connectionDetails")) { + checkmate::assertClass(connectionDetails, "connectionDetails", add = errorMessages) + } else { + checkmate::assertClass(connectionDetails, "ConnectionDetails", add = errorMessages) + } + checkmate::assertCharacter(cdmDatabaseSchema, len = 1, add = errorMessages) + checkmate::assertCharacter(tempEmulationSchema, len = 1, null.ok = TRUE, add = errorMessages) + checkmate::assertCharacter(cohortDatabaseSchema, len = 1, add = errorMessages) + checkmate::assertList(cohortTableNames, min.len = 1, add = errorMessages) + checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, null.ok = TRUE, add = errorMessages) + checkmate::assertDataFrame(negativeControlOutcomeCohortSet, min.rows = 1, null.ok = TRUE, add = errorMessages) + checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first"), add = errorMessages) + checkmate::assert_logical(detectOnDescendants, add = errorMessages) + checkmate::assert_logical(stopOnError, add = errorMessages) + checkmate::reportAssertions(collection = errorMessages) + + # Establish the connection and ensure the cleanup is performed + connection <- DatabaseConnector::connect(connectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + + # Create the export folder + if (!dir.exists(outputFolder)) { + dir.create(outputFolder, recursive = T) + } + + # Create the cohort tables + createCohortTables( + connection = connection, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortTableNames = cohortTableNames, + incremental = incremental + ) + + generateAndExportCohorts( + connection = connection, + cdmDatabaseSchema = cdmDatabaseSchema, + tempEmulationSchema = tempEmulationSchema, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortTableNames = cohortTableNames, + cohortDefinitionSet = cohortDefinitionSet, + stopOnError = stopOnError, + outputFolder = outputFolder, + databaseId = databaseId, + incremental = incremental, + incrementalFolder = incrementalFolder + ) + + generateAndExportNegativeControls( + connection = connection, + cdmDatabaseSchema = cdmDatabaseSchema, + tempEmulationSchema = tempEmulationSchema, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortTableNames = cohortTableNames, + negativeControlOutcomeCohortSet =
negativeControlOutcomeCohortSet, + occurrenceType = occurrenceType, + detectOnDescendants = detectOnDescendants, + outputFolder = outputFolder, + databaseId = databaseId, + incremental = incremental, + incrementalFolder = incrementalFolder + ) + + # Export the results data model specification + file.copy( + from = system.file("csv", "resultsDataModelSpecification.csv", package = "CohortGenerator"), + to = outputFolder + ) + + rlang::inform("Cohort generation complete.") +} + +generateAndExportCohorts <- function(connection, + cdmDatabaseSchema, + tempEmulationSchema, + cohortDatabaseSchema, + cohortTableNames, + cohortDefinitionSet, + stopOnError, + outputFolder, + databaseId, + incremental, + incrementalFolder) { + # Generate the cohorts + cohortsGenerated <- createEmptyResult("cg_cohort_generation") + cohortsGeneratedFileName <- file.path(outputFolder, "cg_cohort_generation.csv") + cohortCounts <- createEmptyResult("cg_cohort_count") + cohortCountsFileName <- file.path(outputFolder, "cg_cohort_count.csv") + if (!is.null(cohortDefinitionSet)) { + # Generate cohorts, get counts, write results + cohortsGenerated <- generateCohortSet( + connection = connection, + cdmDatabaseSchema = cdmDatabaseSchema, + tempEmulationSchema = tempEmulationSchema, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortTableNames = cohortTableNames, + cohortDefinitionSet = cohortDefinitionSet, + stopOnError = stopOnError, + incremental = incremental, + incrementalFolder = incrementalFolder + ) + + cohortCountsFromDb <- getCohortCounts( + connection = connection, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortTable = cohortTableNames$cohortTable, + cohortDefinitionSet = cohortDefinitionSet, + databaseId = databaseId + ) + + # Filter to columns in the results data model + cohortCounts <- cohortCountsFromDb[names(cohortCounts)] + } + + # Save the generation information + rlang::inform("Saving cohort generation information") + if (!is.null(cohortsGenerated) && nrow(cohortsGenerated) > 0) { + cohortsGenerated$databaseId <- databaseId + # Remove any cohorts that were skipped + cohortsGenerated <- cohortsGenerated[toupper(cohortsGenerated$generationStatus) != "SKIPPED", ] + if (incremental) { + # Format the data for saving + names(cohortsGenerated) <- SqlRender::camelCaseToSnakeCase(names(cohortsGenerated)) + saveIncremental( + data = cohortsGenerated, + fileName = cohortsGeneratedFileName, + cohort_id = cohortsGenerated$cohort_id + ) + } else { + writeCsv( + x = cohortsGenerated, + file = cohortsGeneratedFileName + ) + } + } + + rlang::inform("Saving cohort counts") + writeCsv( + x = cohortCounts, + file = cohortCountsFileName + ) + + rlang::inform("Saving cohort statistics") + exportCohortStatsTables( + connection = connection, + cohortTableNames = cohortTableNames, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortStatisticsFolder = outputFolder, + snakeCaseToCamelCase = FALSE, + fileNamesInSnakeCase = TRUE, + incremental = incremental, + databaseId = databaseId, + cohortDefinitionSet = cohortDefinitionSet, + tablePrefix = "cg_" + ) + + # Export the cohort definition set + rlang::inform("Saving cohort definition set") + exportCohortDefinitionSet(outputFolder, cohortDefinitionSet) +} + +generateAndExportNegativeControls <- function(connection, + cdmDatabaseSchema, + tempEmulationSchema, + cohortDatabaseSchema, + cohortTableNames, + negativeControlOutcomeCohortSet, + occurrenceType, + detectOnDescendants, + outputFolder, + databaseId, + incremental, + incrementalFolder) { + # Generate any negative 
controls + negativeControlOutcomes <- createEmptyResult("cg_cohort_definition_neg_ctrl") + negativeControlOutcomesFileName <- file.path(outputFolder, "cg_cohort_definition_neg_ctrl.csv") + cohortCountsNegativeControlOutcomes <- createEmptyResult("cg_cohort_count_neg_ctrl") + cohortCountsNegativeControlOutcomesFileName <- file.path(outputFolder, "cg_cohort_count_neg_ctrl.csv") + if (!is.null(negativeControlOutcomeCohortSet)) { + generateNegativeControlOutcomeCohorts( + connection = connection, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortTable = cohortTableNames$cohortTable, + negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet, + tempEmulationSchema = tempEmulationSchema, + occurrenceType = occurrenceType, + detectOnDescendants = detectOnDescendants, + incremental = incremental, + incrementalFolder = incrementalFolder + ) + + # Assemble the negativeControlOutcomes for export + negativeControlOutcomes <- cbind( + negativeControlOutcomeCohortSet, + occurrenceType = rep(occurrenceType, nrow(negativeControlOutcomeCohortSet)), + detectOnDescendants = rep(detectOnDescendants, nrow(negativeControlOutcomeCohortSet)) + ) + + # Count the negative controls + cohortCountsNegativeControlOutcomes <- getCohortCounts( + connection = connection, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortTable = cohortTableNames$cohortTable, + databaseId = databaseId, + cohortDefinitionSet = negativeControlOutcomeCohortSet[, c("cohortId"), drop = FALSE] + ) + } + + rlang::inform("Saving negative control outcome cohort definition") + writeCsv( + x = negativeControlOutcomes, + file = negativeControlOutcomesFileName + ) + + rlang::inform("Saving negative control outcome cohort counts") + writeCsv( + x = cohortCountsNegativeControlOutcomes, + file = cohortCountsNegativeControlOutcomesFileName + ) +} diff --git a/R/SubsetDefinitions.R b/R/SubsetDefinitions.R index 4453fd6..19911c7 100644 --- a/R/SubsetDefinitions.R +++ b/R/SubsetDefinitions.R @@ -64,7 +64,7 @@ CohortSubsetDefinition <- R6::R6Class( subsetOperators = lapply(self$subsetOperators, function(operator) { operator$toList() }), - packageVersion = jsonlite::unbox(as.character(utils::packageVersion(utils::packageName()))), + packageVersion = jsonlite::unbox(as.character(utils::packageVersion("CohortGenerator"))), identifierExpression = jsonlite::unbox(as.character(private$.identifierExpression)), operatorNameConcatString = jsonlite::unbox(as.character(private$.operatorNameConcatString)), subsetCohortNameTemplate = jsonlite::unbox(as.character(private$.subsetCohortNameTemplate)) @@ -471,7 +471,7 @@ hasSubsetDefinitions <- function(x) { #' @description #' This is generally used as part of saveCohortDefinitionSet #' -#' @param subsetDefinition The subset definition object {@seealso CohortSubsetDefinition} +#' @param subsetDefinition The subset definition object @seealso[CohortSubsetDefinition] #' #' @export #' @inheritParams saveCohortDefinitionSet diff --git a/R/Subsets.R b/R/Subsets.R index c50a0f8..c157f72 100644 --- a/R/Subsets.R +++ b/R/Subsets.R @@ -35,7 +35,7 @@ # SubsetCohortWindow ------------- -#' SubsetCohortWindow settings +#' @title Time Window For Cohort Subset Operator #' @export #' @description #' Representation of a time window to use when subsetting a target cohort with a subset cohort @@ -47,7 +47,6 @@ SubsetCohortWindow <- R6::R6Class( .targetAnchor = "cohortStart" ), public = list( - #' @title to List #' @description List representation of object toList = function() { objRepr 
<- list() @@ -128,7 +127,7 @@ createSubsetCohortWindow <- function(startDay, endDay, targetAnchor) { } # SubsetOperator ------------------------------ -#' @title SubsetOperator +#' @title Abstract base class for subsets. #' @export #' @description #' Abstract Base Class for subsets. Subsets should inherit from this and implement their own requirements. @@ -403,7 +402,7 @@ CohortSubsetOperator <- R6::R6Class( #' #' @param startWindow A SubsetCohortWindow that patients must fall inside (see createSubsetCohortWindow) #' @param endWindow A SubsetCohortWindow that patients must fall inside (see createSubsetCohortWindow) -#' @param negate The opposite of this definition - include patients who do NOT meet the specified criteria (NOT YET IMPLEMENTED) +#' @param negate The opposite of this definition - include patients who do NOT meet the specified criteria #' @returns a CohortSubsetOperator instance createCohortSubset <- function(name = NULL, cohortIds, cohortCombinationOperator, negate, startWindow, endWindow) { subset <- CohortSubsetOperator$new() @@ -418,7 +417,10 @@ createCohortSubset <- function(name = NULL, cohortIds, cohortCombinationOperator } # DemographicSubsetOperator ------------------------------ -#' Criteria Subset +#' @title Demographic Subset Operator +#' @description +#' Operators for subsetting a cohort by demographic criteria +#' #' @export DemographicSubsetOperator <- R6::R6Class( classname = "DemographicSubsetOperator", diff --git a/_pkgdown.yml b/_pkgdown.yml index d0aa7e1..1258641 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -8,6 +8,13 @@ home: href: http://forums.ohdsi.org reference: + - title: "Cohort Generation" + desc: > + Functions that support generating cohorts. + contents: + - runCohortGeneration + - generateCohortSet + - title: "Cohort Tables" desc: > Functions that support creating the necessary cohort tables. @@ -25,13 +32,6 @@ reference: - checkAndFixCohortDefinitionSetDataTypes - isCohortDefinitionSet - - title: "Cohort Generation" - desc: > - Functions that support generating cohorts. - contents: - - generateCohortSet - - createEmptyCohortDefinitionSet - - title: "Cohort Counts" desc: > Function for obtaining the counts of subjects and events for one or @@ -39,23 +39,27 @@ reference: contents: - getCohortCounts - - title: "Cohort Subset" + - title: "Cohort Subset Functions" desc: > - Functions and R6 classes for creating cohort subset definitions and subset - operators. + Functions for creating cohort subset definitions and subset operators. contents: - addCohortSubsetDefinition - - CohortSubsetDefinition - - CohortSubsetOperator - createCohortSubset - createCohortSubsetDefinition - createDemographicSubset - createLimitSubset - createSubsetCohortWindow - - DemographicSubsetOperator - getSubsetDefinitions - - LimitSubsetOperator - saveCohortSubsetDefinition + + - title: "Cohort Subset Classes" + desc: > + R6 classes for cohort subset definitions and subset operators. + contents: + - CohortSubsetDefinition + - CohortSubsetOperator + - DemographicSubsetOperator + - LimitSubsetOperator - SubsetCohortWindow - SubsetOperator @@ -63,9 +67,12 @@ reference: desc: > Functions for inserting inclusion rule names from a cohort definition, exporting the cohort statistics to the file system and a helper function - for dropping those tables when they are no longer needed. + for dropping those tables when they are no longer needed. These functions + assume you are using [Circe](https://github.com/OHDSI/circe-be) for + inclusion rules and cohort statistics. 
contents: - getCohortStats + - getCohortInclusionRules - insertInclusionRuleNames - exportCohortStatsTables - dropCohortStatsTables @@ -77,6 +84,17 @@ reference: contents: - createEmptyNegativeControlOutcomeCohortSet - generateNegativeControlOutcomeCohorts + + - title: "Result Model Management" + desc: > + Functions for managing the results of running Cohort Generator via + `runCohortGeneration` + contents: + - createResultsDataModel + - getDataMigrator + - getResultsDataModelSpecifications + - migrateDataModel + - uploadResults - title: "CSV File Helpers" desc: > diff --git a/docs/404.html b/docs/404.html index 1fb5c5e..da143bd 100644 --- a/docs/404.html +++ b/docs/404.html @@ -32,7 +32,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/articles/CreatingCohortSubsetDefinitions.html b/docs/articles/CreatingCohortSubsetDefinitions.html index 5e8b709..71a5f0e 100644 --- a/docs/articles/CreatingCohortSubsetDefinitions.html +++ b/docs/articles/CreatingCohortSubsetDefinitions.html @@ -33,7 +33,7 @@ CohortGenerator - 0.9.0 + 0.10.0 @@ -91,7 +91,7 @@

Creating Cohort Subset Definitions

James P. Gilbert and Anthony G. Sena

-

2024-05-28

+

2024-07-14

Source: vignettes/CreatingCohortSubsetDefinitions.Rmd @@ -277,6 +277,15 @@

Applying subset NA +1778214 +celecoxibCensored +1778214 + +1778214 +FALSE +NA + + 1778211001 celecoxib - Patients in cohort cohort 1778213 with 365 days prior observation Subset to patients in cohort 1778213, Observation @@ -287,7 +296,7 @@

Applying subset TRUE 1 - + 1778212001 celecoxibAge40 - Patients in cohort cohort 1778213 with 365 days prior observation Subset to patients in cohort 1778213, @@ -298,7 +307,7 @@

Applying subset TRUE 1 - + 1778213001 celecoxibAge40Male - Patients in cohort cohort 1778213 with 365 days prior observation Subset to patients in cohort 1778213, @@ -309,6 +318,17 @@

Applying subset TRUE 1 + +1778214001 +celecoxibCensored - Patients in cohort cohort 1778213 +with 365 days prior observation Subset to patients in cohort 1778213, +Observation of at least 365 days prior +NA +NA +1778214 +TRUE +1 +

We can also apply a subset definition to only a limited number of @@ -366,6 +386,15 @@

Applying subset NA +1778214 +celecoxibCensored +1778214 + +1778214 +FALSE +NA + + 1778211001 celecoxib - Patients in cohort cohort 1778213 with 365 days prior observation Subset to patients in cohort 1778213, Observation @@ -376,7 +405,7 @@

Applying subset TRUE 1 - + 1778212001 celecoxibAge40 - Patients in cohort cohort 1778213 with 365 days prior observation Subset to patients in cohort 1778213, @@ -387,7 +416,7 @@

Applying subset TRUE 1 - + 1778213001 celecoxibAge40Male - Patients in cohort cohort 1778213 with 365 days prior observation Subset to patients in cohort 1778213, @@ -398,6 +427,17 @@

Applying subset TRUE 1 + +1778214001 +celecoxibCensored - Patients in cohort cohort 1778213 +with 365 days prior observation Subset to patients in cohort 1778213, +Observation of at least 365 days prior +NA +NA +1778214 +TRUE +1 + 1778212002 celecoxibAge40 - Patients in cohort 1778213 with 365 @@ -439,9 +479,9 @@

Applying subset paste("Subset Parent Id:", cohortDefinitionSet$subsetParent[4]), paste("Name", cohortDefinitionSet$cohortName[4]) )) -
#> Cohort Id: 1778211001
-#> Subset Parent Id: 1778211
-#> Name celecoxib - Patients in cohort cohort 1778213 with 365 days prior observation Subset to patients in cohort 1778213, Observation of at least 365 days prior
+
#> Cohort Id: 1778214
+#> Subset Parent Id: 1778214
+#> Name celecoxibCensored

Note that when adding a subset definition to a cohort definition set, the target cohort ids (e.g. 1778211, 1778212) must exist in the cohortDefinitionSet and the output ids
diff --git a/docs/articles/GeneratingCohorts.html b/docs/articles/GeneratingCohorts.html
index 2762151..69526fe 100644
--- a/docs/articles/GeneratingCohorts.html
+++ b/docs/articles/GeneratingCohorts.html
@@ -33,7 +33,7 @@
CohortGenerator
- 0.9.0
+ 0.10.0
@@ -91,7 +91,7 @@

Generating Cohorts

Anthony G. Sena and Martijn J. Schuemie

-

2024-05-28

+

2024-07-14

Source: vignettes/GeneratingCohorts.Rmd @@ -262,12 +262,13 @@

Generating Cohorts= "main", cohortTable = cohortTableNames$cohortTable ) -
#> Connecting using SQLite driver
-
#> Counting cohorts took 0.126 secs
+
#> Connecting using SQLite driver
+#> Counting cohorts took 0.131 secs
#>   cohortId cohortEntries cohortSubjects
 #> 1  1778211          1800           1800
 #> 2  1778212           569            569
-#> 3  1778213           266            266
+#> 3 1778213 266 266 +#> 4 1778214 1750 1750
@@ -287,7 +288,7 @@

Cohort Statistics (Inclusio CohortDiagnostics. Building on our basic example, let’s export the cohorts from WebAPI but this time indicate that we’d like to also include the code that generatesStats:

-
+
 

Next we’ll create the tables to store the cohort and the cohort statistics. Then we can generate the cohorts.

-
+
 # First get the cohort table names to use for this generation task
 cohortTableNames <- getCohortTableNames(cohortTable = "stats_example")
 
@@ -321,7 +322,7 @@ 

Cohort Statistics (Inclusio are available in the cohort statistics tables. The next step is to export the results to the file system which is done using the code below:

-
+
 insertInclusionRuleNames(
   connectionDetails = connectionDetails,
   cohortDefinitionSet = cohortDefinitionSet,
@@ -344,7 +345,7 @@ 

Cohort Statistics (Inclusio table in the InclusionStats folder.

Once you have exported your cohort statistics, you can optionally drop the statistics tables by using the following command:

-
+
 dropCohortStatsTables(
   connectionDetails = connectionDetails,
   cohortDatabaseSchema = "main",
@@ -362,7 +363,7 @@ 

Incremental Mode -
+
 # Create a set of tables for this example
 cohortTableNames <- getCohortTableNames(cohortTable = "cohort")
 createCohortTables(
@@ -376,15 +377,15 @@ 

Incremental Mode -
+
 createCohortTables(
   connectionDetails = connectionDetails,
   cohortTableNames = cohortTableNames,
   cohortDatabaseSchema = "main",
   incremental = TRUE
 )
-
#> Connecting using SQLite driver
-
#> Table "cohort" already exists and in incremental mode, so not recreating it.
+
#> Connecting using SQLite driver
+#> Table "cohort" already exists and in incremental mode, so not recreating it.
 #> Table "cohort" already exists and in incremental mode, so not recreating it.
 #> Table "cohort_inclusion" already exists and in incremental mode, so not recreating it.
 #> Table "cohort_inclusion_result" already exists and in incremental mode, so not recreating it.
@@ -394,7 +395,7 @@ 

Incremental ModeThe use of incremental = TRUE here allows for assurance that tables and results from previous runs are preserved. Next, we can generate our cohortDefinitionSet in incremental mode.

-
+
 generateCohortSet(
   connectionDetails = connectionDetails,
   cdmDatabaseSchema = "main",
@@ -418,7 +419,7 @@ 

Incremental ModeincrementalFolder for the same cohort ID, the generation is skipped. To illustrate how this looks:

-
+
 generateCohortSet(
   connectionDetails = connectionDetails,
   cdmDatabaseSchema = "main",
@@ -429,11 +430,12 @@ 

Incremental Mode incrementalFolder = file.path(someFolder, "RecordKeeping") )

#> Connecting using SQLite driver
-
#> Initiating cluster consisting only of main thread
-#> Skipping cohortId = '1778211' because it is unchanged from earlier run
+
#> Initiating cluster consisting only of main thread
+
#> Skipping cohortId = '1778211' because it is unchanged from earlier run
 #> Skipping cohortId = '1778212' because it is unchanged from earlier run
 #> Skipping cohortId = '1778213' because it is unchanged from earlier run
-#> Generating cohort set took 0.11 secs
+#> Skipping cohortId = '1778214' because it is unchanged from earlier run
+
#> Generating cohort set took 0.13 secs
Potential Pitfalls of Incremental Mode
diff --git a/docs/articles/SamplingCohorts.html b/docs/articles/SamplingCohorts.html index 76bc9ec..e856ab2 100644 --- a/docs/articles/SamplingCohorts.html +++ b/docs/articles/SamplingCohorts.html @@ -33,7 +33,7 @@ CohortGenerator - 0.9.0 + 0.10.0
@@ -91,7 +91,7 @@

Sampling Cohorts

James P. Gilbert

-

2024-05-28

+

2024-07-14

Source: vignettes/SamplingCohorts.Rmd diff --git a/docs/articles/index.html b/docs/articles/index.html index c0fbc19..23c3513 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/authors.html b/docs/authors.html index 934e4a3..83a259e 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0

diff --git a/docs/index.html b/docs/index.html index e325d4e..e722aa9 100644 --- a/docs/index.html +++ b/docs/index.html @@ -33,7 +33,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/news/index.html b/docs/news/index.html index abf1fc5..caab98e 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0

@@ -66,6 +66,11 @@

Changelog

Source: NEWS.md

+
+ +

New Features
- Add runCohortGeneration function (Issue #165)
- Adopt ResultModelManager for handling results data models & uploading. Extend results data model to include information on cohort subsets (#154, #162)
- Remove REMOTES entries for CirceR and Eunomia, which are now in CRAN (#145)
- Unit tests now running on all OHDSI DB Platforms (#151)

+

Bug Fixes
- Negation of cohort subset operator must join on subject_id AND start_date (#167)
- Allow integer as cohort ID (#146)
- Use native messaging functions for output vs. ParallelLogger (#97)
- Prevent upload of inclusion rule information (#78)
- Expose colTypes when working with .csv files (#59)
- Remove bit64 from package (mostly) (#152)
- Updated documentation for cohort subset negate feature (#111)

+
  • Random sample functionality (for development only) (Issue #129)
  • diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index ec62a11..475ec55 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -5,5 +5,5 @@ articles: CreatingCohortSubsetDefinitions: CreatingCohortSubsetDefinitions.html GeneratingCohorts: GeneratingCohorts.html SamplingCohorts: SamplingCohorts.html -last_built: 2024-05-28T17:58Z +last_built: 2024-07-14T13:51Z diff --git a/docs/reference/CohortGenerator-package.html b/docs/reference/CohortGenerator-package.html index 94e32b7..7030844 100644 --- a/docs/reference/CohortGenerator-package.html +++ b/docs/reference/CohortGenerator-package.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/reference/CohortSubsetDefinition.html b/docs/reference/CohortSubsetDefinition.html index 3540129..f71367a 100644 --- a/docs/reference/CohortSubsetDefinition.html +++ b/docs/reference/CohortSubsetDefinition.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/reference/CohortSubsetOperator.html b/docs/reference/CohortSubsetOperator.html index eb899f8..a493c14 100644 --- a/docs/reference/CohortSubsetOperator.html +++ b/docs/reference/CohortSubsetOperator.html @@ -18,7 +18,7 @@ CohortGenerator - 0.9.0 + 0.10.0

diff --git a/docs/reference/DemographicSubsetOperator.html b/docs/reference/DemographicSubsetOperator.html index b0286f9..58adfff 100644 --- a/docs/reference/DemographicSubsetOperator.html +++ b/docs/reference/DemographicSubsetOperator.html @@ -1,6 +1,5 @@ -Criteria Subset — DemographicSubsetOperator • CohortGeneratorDemographic Subset Operator — DemographicSubsetOperator • CohortGenerator @@ -18,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0
@@ -63,14 +62,13 @@
-

Criteria Subset

-

Criteria Subset

+

Operators for subsetting a cohort by demographic criteria

diff --git a/docs/reference/LimitSubsetOperator.html b/docs/reference/LimitSubsetOperator.html index 22f4022..fbcaf92 100644 --- a/docs/reference/LimitSubsetOperator.html +++ b/docs/reference/LimitSubsetOperator.html @@ -18,7 +18,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/reference/SubsetCohortWindow.html b/docs/reference/SubsetCohortWindow.html index 08abaf9..3047776 100644 --- a/docs/reference/SubsetCohortWindow.html +++ b/docs/reference/SubsetCohortWindow.html @@ -1,5 +1,5 @@ -to List — SubsetCohortWindow • CohortGeneratorTime Window For Cohort Subset Operator — SubsetCohortWindow • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0
@@ -62,7 +62,7 @@
@@ -72,10 +72,6 @@

to List

-
-

Details

-

SubsetCohortWindow settings

-

Active bindings

startDay
diff --git a/docs/reference/SubsetOperator.html b/docs/reference/SubsetOperator.html index b9629d1..1281969 100644 --- a/docs/reference/SubsetOperator.html +++ b/docs/reference/SubsetOperator.html @@ -1,5 +1,5 @@ -SubsetOperator — SubsetOperator • CohortGeneratorAbstract base class for subsets. — SubsetOperator • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0
@@ -62,7 +62,7 @@
diff --git a/docs/reference/addCohortSubsetDefinition.html b/docs/reference/addCohortSubsetDefinition.html index 90567f9..465e098 100644 --- a/docs/reference/addCohortSubsetDefinition.html +++ b/docs/reference/addCohortSubsetDefinition.html @@ -20,7 +20,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/reference/checkAndFixCohortDefinitionSetDataTypes.html b/docs/reference/checkAndFixCohortDefinitionSetDataTypes.html index 947ac3f..b2ac30c 100644 --- a/docs/reference/checkAndFixCohortDefinitionSetDataTypes.html +++ b/docs/reference/checkAndFixCohortDefinitionSetDataTypes.html @@ -19,7 +19,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/reference/computeChecksum.html b/docs/reference/computeChecksum.html index 065b877..6b88acc 100644 --- a/docs/reference/computeChecksum.html +++ b/docs/reference/computeChecksum.html @@ -19,7 +19,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/reference/createCohortSubset.html b/docs/reference/createCohortSubset.html index d143bf1..ae84cb4 100644 --- a/docs/reference/createCohortSubset.html +++ b/docs/reference/createCohortSubset.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0
@@ -98,7 +98,7 @@

Arguments

negate
-

The opposite of this definition - include patients who do NOT meet the specified criteria (NOT YET IMPLEMENTED)

+

The opposite of this definition - include patients who do NOT meet the specified criteria

startWindow
diff --git a/docs/reference/createCohortSubsetDefinition.html b/docs/reference/createCohortSubsetDefinition.html index 73a692c..3244251 100644 --- a/docs/reference/createCohortSubsetDefinition.html +++ b/docs/reference/createCohortSubsetDefinition.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0

diff --git a/docs/reference/createCohortTables.html b/docs/reference/createCohortTables.html index c28cf0c..4a7c50c 100644 --- a/docs/reference/createCohortTables.html +++ b/docs/reference/createCohortTables.html @@ -18,7 +18,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/reference/createDemographicSubset.html b/docs/reference/createDemographicSubset.html index a3879b9..1b231e9 100644 --- a/docs/reference/createDemographicSubset.html +++ b/docs/reference/createDemographicSubset.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0

diff --git a/docs/reference/createEmptyCohortDefinitionSet.html b/docs/reference/createEmptyCohortDefinitionSet.html index 31f3f6a..5edc70f 100644 --- a/docs/reference/createEmptyCohortDefinitionSet.html +++ b/docs/reference/createEmptyCohortDefinitionSet.html @@ -18,7 +18,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/reference/createEmptyNegativeControlOutcomeCohortSet.html b/docs/reference/createEmptyNegativeControlOutcomeCohortSet.html index 8c2f1e1..008b62d 100644 --- a/docs/reference/createEmptyNegativeControlOutcomeCohortSet.html +++ b/docs/reference/createEmptyNegativeControlOutcomeCohortSet.html @@ -18,7 +18,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/reference/createLimitSubset.html b/docs/reference/createLimitSubset.html index 4071515..961433f 100644 --- a/docs/reference/createLimitSubset.html +++ b/docs/reference/createLimitSubset.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0

diff --git a/docs/reference/createResultsDataModel.html b/docs/reference/createResultsDataModel.html new file mode 100644 index 0000000..b1f7e10 --- /dev/null +++ b/docs/reference/createResultsDataModel.html @@ -0,0 +1,124 @@ + +Create the results data model tables on a database server. — createResultsDataModel • CohortGenerator + + +
+
+ + + +
+
+ + +
+

Create the results data model tables on a database server.

+
+ +
+
createResultsDataModel(
+  connectionDetails = NULL,
+  databaseSchema,
+  tablePrefix = ""
+)
+
+ +
+

Arguments

+
connectionDetails
+

DatabaseConnector connectionDetails instance, as created by DatabaseConnector::createConnectionDetails

+ + +
databaseSchema
+

The schema on the server where the tables will be created.

+ + +
tablePrefix
+

(Optional) string to insert before table names when constructing the database table names

+ +
+
+

Details

+

Only PostgreSQL and SQLite servers are supported.
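As a minimal sketch, assuming the Eunomia example package for a throwaway SQLite database (where "main" is SQLite's single schema):

connectionDetails <- Eunomia::getEunomiaConnectionDetails()
CohortGenerator::createResultsDataModel(
  connectionDetails = connectionDetails,
  databaseSchema = "main"
)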

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/reference/createSubsetCohortWindow.html b/docs/reference/createSubsetCohortWindow.html index 27f90e4..08f5230 100644 --- a/docs/reference/createSubsetCohortWindow.html +++ b/docs/reference/createSubsetCohortWindow.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/dropCohortStatsTables.html b/docs/reference/dropCohortStatsTables.html index 4c9e8cd..bd0a8b9 100644 --- a/docs/reference/dropCohortStatsTables.html +++ b/docs/reference/dropCohortStatsTables.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/exportCohortStatsTables.html b/docs/reference/exportCohortStatsTables.html index 258095b..363565d 100644 --- a/docs/reference/exportCohortStatsTables.html +++ b/docs/reference/exportCohortStatsTables.html @@ -1,7 +1,12 @@ Export the cohort statistics tables to the file system — exportCohortStatsTables • CohortGenerator @@ -19,7 +24,7 @@ CohortGenerator - 0.9.0 + 0.10.0 @@ -72,7 +77,12 @@

Export the cohort statistics tables to the file system

This function retrieves the data from the cohort statistics tables and writes them to the inclusion statistics folder specified in the function -call.

+call. NOTE: inclusion rule names are handled in one of two ways:

+

1. You can specify the cohortDefinitionSet parameter and the inclusion rule +names will be extracted from the data.frame. +2. You can insert the inclusion rule names into the database using the +insertInclusionRuleNames function of this package.

+

The first approach is preferred, as it avoids the warning that is otherwise emitted.
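A sketch of the preferred form, assuming a connectionDetails object and a cohortDefinitionSet built as in the vignettes:

CohortGenerator::exportCohortStatsTables(
  connectionDetails = connectionDetails,
  cohortDatabaseSchema = "main",
  cohortTableNames = CohortGenerator::getCohortTableNames(cohortTable = "cohort"),
  cohortStatisticsFolder = file.path("output", "InclusionStats"),
  cohortDefinitionSet = cohortDefinitionSet  # inclusion rule names are taken from here
)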

@@ -85,7 +95,9 @@

Export the cohort statistics tables to the file system

snakeCaseToCamelCase = TRUE, fileNamesInSnakeCase = FALSE, incremental = FALSE, - databaseId = NULL + databaseId = NULL, + cohortDefinitionSet = NULL, + tablePrefix = "" )
@@ -140,6 +152,28 @@

Arguments

Optional - when specified, the databaseId will be added to the exported results

+ +
cohortDefinitionSet
+

The cohortDefinitionSet argument must be a data frame with +the following columns:

cohortId
+

The unique integer identifier of the cohort

+ +
cohortName
+

The cohort's name

+ +
sql
+

The OHDSI-SQL used to generate the cohort

+ +

Optionally, this data frame may contain:

json
+

The Circe JSON representation of the cohort

+ +
+ + +
tablePrefix
+

Optional - allows a prefix to be appended to the exported file names.

+ diff --git a/docs/reference/generateCohortSet.html b/docs/reference/generateCohortSet.html index 3a73337..6180c55 100644 --- a/docs/reference/generateCohortSet.html +++ b/docs/reference/generateCohortSet.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/generateNegativeControlOutcomeCohorts.html b/docs/reference/generateNegativeControlOutcomeCohorts.html index 3d94e46..b217dfb 100644 --- a/docs/reference/generateNegativeControlOutcomeCohorts.html +++ b/docs/reference/generateNegativeControlOutcomeCohorts.html @@ -19,7 +19,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/getCohortCounts.html b/docs/reference/getCohortCounts.html index 51ddd6b..1ed50e2 100644 --- a/docs/reference/getCohortCounts.html +++ b/docs/reference/getCohortCounts.html @@ -20,7 +20,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/getCohortDefinitionSet.html b/docs/reference/getCohortDefinitionSet.html index 6fabd6b..2424a59 100644 --- a/docs/reference/getCohortDefinitionSet.html +++ b/docs/reference/getCohortDefinitionSet.html @@ -20,7 +20,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/getCohortInclusionRules.html b/docs/reference/getCohortInclusionRules.html new file mode 100644 index 0000000..8dea65b --- /dev/null +++ b/docs/reference/getCohortInclusionRules.html @@ -0,0 +1,123 @@ + +Get Cohort Inclusion Rules from a cohort definition set — getCohortInclusionRules • CohortGenerator + + +
+
+ + + +
+
+ + +
+

This function returns a data frame of the inclusion rules defined +in a cohort definition set.
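A small sketch, assuming a cohortDefinitionSet whose cohorts were built by Circe with inclusion rules; the column names noted below are an assumption based on the cg_cohort_inclusion results table:

inclusionRules <- CohortGenerator::getCohortInclusionRules(cohortDefinitionSet)
# Assumed shape: one row per rule, with cohortDefinitionId, ruleSequence,
# name and description columns
head(inclusionRules)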

+
+ +
+
getCohortInclusionRules(cohortDefinitionSet)
+
+ +
+

Arguments

+
cohortDefinitionSet
+

The cohortDefinitionSet argument must be a data frame with +the following columns:

cohortId
+

The unique integer identifier of the cohort

+ +
cohortName
+

The cohort's name

+ +
sql
+

The OHDSI-SQL used to generate the cohort

+ +

Optionally, this data frame may contain:

json
+

The Circe JSON representation of the cohort

+ +
+ +
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/reference/getCohortStats.html b/docs/reference/getCohortStats.html index 9665f7a..5fa025c 100644 --- a/docs/reference/getCohortStats.html +++ b/docs/reference/getCohortStats.html @@ -27,7 +27,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/getCohortTableNames.html b/docs/reference/getCohortTableNames.html index 24eaf0a..352585b 100644 --- a/docs/reference/getCohortTableNames.html +++ b/docs/reference/getCohortTableNames.html @@ -19,7 +19,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/getDataMigrator.html b/docs/reference/getDataMigrator.html new file mode 100644 index 0000000..6c29602 --- /dev/null +++ b/docs/reference/getDataMigrator.html @@ -0,0 +1,122 @@ + +Get database migrations instance — getDataMigrator • CohortGenerator + + +
+
+ + + +
+
+ + +
+

Returns a ResultModelManager DataMigrationManager instance.

+
+ +
+
getDataMigrator(connectionDetails, databaseSchema, tablePrefix = "")
+
+ +
+

Arguments

+
connectionDetails
+

DatabaseConnector connection details object

+ + +
databaseSchema
+

String name of the schema where the results data model tables reside

+ + +
tablePrefix
+

(Optional) Use if a table prefix is used before table names (e.g. "cg_")

+ +
+
+

Value

+ + +

Instance of ResultModelManager::DataMigrationManager that provides an interface for migrating existing data models
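A sketch of driving a migration by hand, mirroring what migrateDataModel does internally:

migrator <- CohortGenerator::getDataMigrator(
  connectionDetails = connectionDetails,
  databaseSchema = "main"
)
migrator$executeMigrations()  # apply any outstanding migrations
migrator$finalize()           # clean up the manager when done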

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/reference/getRequiredTasks.html b/docs/reference/getRequiredTasks.html index e2c8f0e..84010df 100644 --- a/docs/reference/getRequiredTasks.html +++ b/docs/reference/getRequiredTasks.html @@ -19,7 +19,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/getResultsDataModelSpecifications.html b/docs/reference/getResultsDataModelSpecifications.html new file mode 100644 index 0000000..6f65545 --- /dev/null +++ b/docs/reference/getResultsDataModelSpecifications.html @@ -0,0 +1,108 @@ + +Get specifications for CohortGenerator results data model — getResultsDataModelSpecifications • CohortGenerator + + +
+
+ + + +
+
+ + +
+

Get specifications for CohortGenerator results data model

+
+ +
+
getResultsDataModelSpecifications()
+
+ +
+

Value

+ + +

A tibble containing the specifications of the CohortGenerator results data model
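For example, the specification can be inspected to list the tables in the model:

specs <- CohortGenerator::getResultsDataModelSpecifications()
unique(specs$tableName)  # "cg_cohort_definition", "cg_cohort_generation", ...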

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/reference/getSubsetDefinitions.html b/docs/reference/getSubsetDefinitions.html index 7564da7..95f1d53 100644 --- a/docs/reference/getSubsetDefinitions.html +++ b/docs/reference/getSubsetDefinitions.html @@ -21,7 +21,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/index.html b/docs/reference/index.html index f8b1ac2..04a3217 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0 @@ -66,6 +66,18 @@

Reference

+ + + + + @@ -101,18 +113,6 @@

Cohort Definition Set isCohortDefinitionSet()

- - - - - - - - - @@ -157,38 +149,54 @@

Cohort Subset createSubsetCohortWindow()

- - - - + + + + + + + + + - + - + + + @@ -213,6 +221,30 @@

Negative Control Outcomes generateNegativeControlOutcomeCohorts()

+ + + + + + + + + + +
+

Cohort Generation

+

Functions that support generating cohorts.

+
+

runCohortGeneration()

+

Run a cohort generation and export results

+

generateCohortSet()

+

Generate a set of cohorts

Cohort Tables

Functions that support creating the necessary cohort tables.

Is the data.frame a cohort definition set?

-

Cohort Generation

-

Functions that support generating cohorts.

-
-

generateCohortSet()

-

Generate a set of cohorts

-

createEmptyCohortDefinitionSet()

-

Create an empty cohort definition set

Cohort Counts

Function for obtaining the counts of subjects and events for one or more cohorts

@@ -122,21 +122,13 @@

Cohort Counts

Count the cohort(s)

-

Cohort Subset

-

Functions and R6 classes for creating cohort subset definitions and subset operators.

+

Cohort Subset Functions

+

Functions for creating cohort subset definitions and subset operators.

addCohortSubsetDefinition()

Add cohort subset definition to a cohort definition set

-

CohortSubsetDefinition

-

Cohort Subset Definition

-

CohortSubsetOperator

-

Cohort Subset Operator

createCohortSubset()

A definition of subset functions to be applied to a set of cohorts

-

DemographicSubsetOperator

-

Criteria Subset

getSubsetDefinitions()

Get cohort subset definitions from a cohort definition set

-

LimitSubsetOperator

-

Limit Subset Operator

saveCohortSubsetDefinition()

Save cohort subset definitions to json

+

Cohort Subset Classes

+

R6 classes for cohort subset definitions and subset operators.

+
+

CohortSubsetDefinition

+

Cohort Subset Definition

+

CohortSubsetOperator

+

Cohort Subset Operator

+

DemographicSubsetOperator

+

Demographic Subset Operator

+

LimitSubsetOperator

+

Limit Subset Operator

SubsetCohortWindow

to List

Time Window For Cohort Subset Operator

SubsetOperator

SubsetOperator

Abstract base class for subsets.

Cohort Statistics

-

Functions for inserting inclusion rule names from a cohort definition, exporting the cohort statistics to the file system and a helper function for dropping those tables when they are no longer needed.

+

Functions for inserting inclusion rule names from a cohort definition, exporting the cohort statistics to the file system and a helper function for dropping those tables when they are no longer needed. These functions assume you are using Circe for inclusion rules and cohort statistics.

getCohortStats()

Get Cohort Inclusion Stats Table Data

+

getCohortInclusionRules()

+

Get Cohort Inclusion Rules from a cohort definition set

insertInclusionRuleNames()

Generate a set of negative control outcome cohorts

+

Result Model Management

+

Functions for managing the results of running CohortGenerator via runCohortGeneration

+
+

createResultsDataModel()

+

Create the results data model tables on a database server.

+

getDataMigrator()

+

Get database migrations instance

+

getResultsDataModelSpecifications()

+

Get specifications for CohortGenerator results data model

+

migrateDataModel()

+

Migrate Data model

+

uploadResults()

+

Upload results to the database server.

CSV File Helpers

Functions for reading and writing CSV files to ensure adherence to the HADES standard when interfacing between R and SQL/File System: https://ohdsi.github.io/Hades/codeStyle.html#Interfacing_between_R_and_SQL

diff --git a/docs/reference/insertInclusionRuleNames.html b/docs/reference/insertInclusionRuleNames.html index 4e47142..2e7edc1 100644 --- a/docs/reference/insertInclusionRuleNames.html +++ b/docs/reference/insertInclusionRuleNames.html @@ -21,7 +21,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/isCamelCase.html b/docs/reference/isCamelCase.html index 6c716f1..180c5bd 100644 --- a/docs/reference/isCamelCase.html +++ b/docs/reference/isCamelCase.html @@ -18,7 +18,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/isCohortDefinitionSet.html b/docs/reference/isCohortDefinitionSet.html index 310f356..fdcd446 100644 --- a/docs/reference/isCohortDefinitionSet.html +++ b/docs/reference/isCohortDefinitionSet.html @@ -18,7 +18,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/isFormattedForDatabaseUpload.html b/docs/reference/isFormattedForDatabaseUpload.html index 0bc0021..f8b9a68 100644 --- a/docs/reference/isFormattedForDatabaseUpload.html +++ b/docs/reference/isFormattedForDatabaseUpload.html @@ -18,7 +18,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/isSnakeCase.html b/docs/reference/isSnakeCase.html index 08554da..985e571 100644 --- a/docs/reference/isSnakeCase.html +++ b/docs/reference/isSnakeCase.html @@ -18,7 +18,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/isTaskRequired.html b/docs/reference/isTaskRequired.html index 3a2dfa3..75332da 100644 --- a/docs/reference/isTaskRequired.html +++ b/docs/reference/isTaskRequired.html @@ -19,7 +19,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/migrateDataModel.html b/docs/reference/migrateDataModel.html new file mode 100644 index 0000000..772c00e --- /dev/null +++ b/docs/reference/migrateDataModel.html @@ -0,0 +1,120 @@ + +Migrate Data model — migrateDataModel • CohortGenerator + + +
+
+ + + +
+
+ + +
+

Migrate data from current state to next state

+

It is strongly advised that you back up all data before migrating: keep the sqlite files, a backup of the database (if you are using a postgres backend), or the csv/zip files from your data generation.
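A minimal sketch, assuming the results live in a schema named "main" with no extra table prefix:

# Take a backup first (e.g. copy the sqlite file or dump the schema), then:
CohortGenerator::migrateDataModel(
  connectionDetails = connectionDetails,
  databaseSchema = "main"
)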

+
+ +
+
migrateDataModel(connectionDetails, databaseSchema, tablePrefix = "")
+
+ +
+

Arguments

+
connectionDetails
+

DatabaseConnector connection details object

+ + +
databaseSchema
+

String name of the schema where the results data model tables reside

+ + +
tablePrefix
+

(Optional) Use if a table prefix is used before table names (e.g. "cg_")

+ +
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/reference/readCsv.html b/docs/reference/readCsv.html index cee3cec..42ec1e1 100644 --- a/docs/reference/readCsv.html +++ b/docs/reference/readCsv.html @@ -21,7 +21,7 @@ CohortGenerator - 0.9.0 + 0.10.0 @@ -80,7 +80,7 @@

Used to read a .csv file

-
readCsv(file, warnOnCaseMismatch = TRUE)
+
readCsv(file, warnOnCaseMismatch = TRUE, colTypes = readr::cols())
@@ -93,6 +93,34 @@

Arguments

When TRUE, raise a warning if column headings in the .csv are not in snake_case format

+ +
colTypes
+

Corresponds to the `col_types` in the `readr::read_csv` function. + One of `NULL`, a [readr::cols()] specification, or + a string. See `vignette("readr")` for more details.

+

If `NULL`, all column types will be inferred from `guess_max` rows of the + input, interspersed throughout the file. This is convenient (and fast), + but not robust. If the guessed types are wrong, you'll need to increase + `guess_max` or supply the correct types yourself.

+

Column specifications created by [list()] or [cols()] must contain + one column specification for each column.

+

Alternatively, you can use a compact string representation where each
+ character represents one column:
+ - c = character
+ - i = integer
+ - n = number
+ - d = double
+ - l = logical
+ - f = factor
+ - D = date
+ - T = date time
+ - t = time
+ - ? = guess
+ - _ or - = skip

+

By default, reading a file without a column specification will print a + message showing what `readr` guessed they were. To remove this message, + set `show_col_types = FALSE` or set `options(readr.show_col_types = FALSE)`.
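For instance, a sketch that pins the column types of a hypothetical two-column cohort file using the compact string form:

# "i" = integer, "c" = character; the file name is hypothetical
cohorts <- CohortGenerator::readCsv(
  file = "cohorts.csv",
  colTypes = "ic"
)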

+

Value

diff --git a/docs/reference/recordTasksDone.html b/docs/reference/recordTasksDone.html index b934c79..056c5aa 100644 --- a/docs/reference/recordTasksDone.html +++ b/docs/reference/recordTasksDone.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/reference/runCohortGeneration.html b/docs/reference/runCohortGeneration.html new file mode 100644 index 0000000..49066fb --- /dev/null +++ b/docs/reference/runCohortGeneration.html @@ -0,0 +1,229 @@ + +Run a cohort generation and export results — runCohortGeneration • CohortGenerator + + +
+
+ + + +
+
+ + +
+

Run a cohort generation and export results

+
+ +
+
runCohortGeneration(
+  connectionDetails,
+  cdmDatabaseSchema,
+  tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
+  cohortDatabaseSchema = cdmDatabaseSchema,
+  cohortTableNames = getCohortTableNames(),
+  cohortDefinitionSet = NULL,
+  negativeControlOutcomeCohortSet = NULL,
+  occurrenceType = "all",
+  detectOnDescendants = FALSE,
+  stopOnError = TRUE,
+  outputFolder,
+  databaseId = 1,
+  incremental = FALSE,
+  incrementalFolder = NULL
+)
+
+ +
+

Arguments

+
connectionDetails
+

An object of type connectionDetails as created using the +createConnectionDetails function in the +DatabaseConnector package.

+ + +
cdmDatabaseSchema
+

Schema name where your patient-level data in OMOP CDM format resides. +Note that for SQL Server, this should include both the database and +schema name, for example 'cdm_data.dbo'.

+ + +
tempEmulationSchema
+

Some database platforms like Oracle and Impala do not truly support +temp tables. To emulate temp tables, provide a schema with write +privileges where temp tables can be created.

+ + +
cohortDatabaseSchema
+

Schema name where your cohort tables reside. Note that for SQL Server, +this should include both the database and schema name, for example +'scratch.dbo'.

+ + +
cohortTableNames
+

The names of the cohort tables. See getCohortTableNames +for more details.

+ + +
cohortDefinitionSet
+

The cohortDefinitionSet argument must be a data frame with +the following columns:

cohortId
+

The unique integer identifier of the cohort

+ +
cohortName
+

The cohort's name

+ +
sql
+

The OHDSI-SQL used to generate the cohort

+ +

Optionally, this data frame may contain:

json
+

The Circe JSON representation of the cohort

+ +
+ + +
negativeControlOutcomeCohortSet
+

The negativeControlOutcomeCohortSet argument must be a data frame with +the following columns:

cohortId
+

The unique integer identifier of the cohort

+ +
cohortName
+

The cohort's name

+ +
outcomeConceptId
+

The concept_id in the condition domain to use for the negative control outcome.

+ +
+ + +
occurrenceType
+

For negative control outcomes, the occurrenceType will detect either: the first time an outcomeConceptId occurs or all times the outcomeConceptId occurs for a person. Values accepted: 'all' or 'first'.

+ + +
detectOnDescendants
+

For negative control outcomes, when set to TRUE, detectOnDescendants will use the vocabulary to find negative control outcomes using the outcomeConceptId and all descendants via the concept_ancestor table. When FALSE, only the exact outcomeConceptId will be used to detect the outcome.

+ + +
stopOnError
+

If an error happens while generating one of the +cohorts in the cohortDefinitionSet, should we +stop processing the other cohorts? The default is +TRUE; when set to FALSE, failures will be +identified in the return value from this function.

+ + +
outputFolder
+

Name of the folder where all the outputs will be written to.

+ + +
databaseId
+

A unique ID for the database. This value will be added to most of the exported result tables.

+ + +
incremental
+

Create only cohorts that haven't been created before?

+ + +
incrementalFolder
+

If incremental = TRUE, specify a folder where +records are kept of which definition has been +executed.

+ +
+
+

Details

+

Run a cohort generation for a set of cohorts and negative control outcomes. +This function will also export the results of the run to the `outputFolder`.
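An end-to-end sketch, assuming the Eunomia example database and a cohortDefinitionSet built as in the package vignettes:

connectionDetails <- Eunomia::getEunomiaConnectionDetails()
CohortGenerator::runCohortGeneration(
  connectionDetails = connectionDetails,
  cdmDatabaseSchema = "main",
  cohortDatabaseSchema = "main",
  cohortTableNames = CohortGenerator::getCohortTableNames(cohortTable = "cohort"),
  cohortDefinitionSet = cohortDefinitionSet,
  outputFolder = file.path(tempdir(), "cgResults"),
  databaseId = "Eunomia"
)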

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/reference/sampleCohortDefinitionSet.html b/docs/reference/sampleCohortDefinitionSet.html index 41c171e..c6566eb 100644 --- a/docs/reference/sampleCohortDefinitionSet.html +++ b/docs/reference/sampleCohortDefinitionSet.html @@ -24,7 +24,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/saveCohortDefinitionSet.html b/docs/reference/saveCohortDefinitionSet.html index 22c8814..94e3d22 100644 --- a/docs/reference/saveCohortDefinitionSet.html +++ b/docs/reference/saveCohortDefinitionSet.html @@ -23,7 +23,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/reference/saveCohortSubsetDefinition.html b/docs/reference/saveCohortSubsetDefinition.html index d2291c1..98862b5 100644 --- a/docs/reference/saveCohortSubsetDefinition.html +++ b/docs/reference/saveCohortSubsetDefinition.html @@ -17,7 +17,7 @@ CohortGenerator - 0.9.0 + 0.10.0 @@ -81,7 +81,7 @@

Save cohort subset definitions to json

Arguments

subsetDefinition
-

The subset definition object @seealso CohortSubsetDefinition

+

The subset definition object. See CohortSubsetDefinition.

subsetJsonFolder
diff --git a/docs/reference/saveIncremental.html b/docs/reference/saveIncremental.html index 0f48925..6914fc8 100644 --- a/docs/reference/saveIncremental.html +++ b/docs/reference/saveIncremental.html @@ -19,7 +19,7 @@ CohortGenerator - 0.9.0 + 0.10.0
diff --git a/docs/reference/uploadResults.html b/docs/reference/uploadResults.html new file mode 100644 index 0000000..cf7024b --- /dev/null +++ b/docs/reference/uploadResults.html @@ -0,0 +1,146 @@ + +Upload results to the database server. — uploadResults • CohortGenerator + + +
+
+ + + +
+
+ + +
+

Requires that the results data model tables have been created using the createResultsDataModel function.

+
+ +
+
uploadResults(
+  connectionDetails,
+  schema,
+  resultsFolder,
+  forceOverWriteOfSpecifications = FALSE,
+  purgeSiteDataBeforeUploading = TRUE,
+  tablePrefix = "",
+  ...
+)
+
+ +
+

Arguments

+
connectionDetails
+

An object of type connectionDetails as created using the +createConnectionDetails function in the +DatabaseConnector package.

+ + +
schema
+

The schema on the server where the tables have been created.

+ + +
resultsFolder
+

The folder holding the results in .csv files

+ + +
forceOverWriteOfSpecifications
+

If TRUE, specifications of the phenotypes, cohort definitions, and analysis +will be overwritten if they already exist on the database. Only use this if these specifications +have changed since the last upload.

+ + +
purgeSiteDataBeforeUploading
+

If TRUE, before inserting data for a specific databaseId all the data for that site will be dropped. This assumes the resultsFolder contains the full data for that site.

+ + +
tablePrefix
+

(Optional) string to insert before table names when constructing the database table names

+ + +
...
+

See ResultModelManager::uploadResults
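A sketch of a typical call, pointing at the folder produced by runCohortGeneration:

CohortGenerator::uploadResults(
  connectionDetails = connectionDetails,
  schema = "main",
  resultsFolder = file.path(tempdir(), "cgResults"),
  purgeSiteDataBeforeUploading = FALSE
)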

+ +
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/reference/writeCsv.html b/docs/reference/writeCsv.html index 15328a3..f98829e 100644 --- a/docs/reference/writeCsv.html +++ b/docs/reference/writeCsv.html @@ -28,7 +28,7 @@ CohortGenerator - 0.9.0 + 0.10.0 diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 3808b80..b810c7d 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -72,6 +72,9 @@ /reference/createLimitSubset.html + + /reference/createResultsDataModel.html + /reference/createSubsetCohortWindow.html @@ -105,15 +108,24 @@ /reference/getCohortDefinitionSet.html + + /reference/getCohortInclusionRules.html + /reference/getCohortStats.html /reference/getCohortTableNames.html + + /reference/getDataMigrator.html + /reference/getRequiredTasks.html + + /reference/getResultsDataModelSpecifications.html + /reference/getSubsetDefinitions.html @@ -141,12 +153,18 @@ /reference/LimitSubsetOperator.html + + /reference/migrateDataModel.html + /reference/readCsv.html /reference/recordTasksDone.html + + /reference/runCohortGeneration.html + /reference/sampleCohortDefinitionSet.html @@ -168,6 +186,9 @@ /reference/SubsetOperator.html + + /reference/uploadResults.html + /reference/writeCsv.html diff --git a/extras/CohortGenerator.pdf b/extras/CohortGenerator.pdf index 0822960..ea4a379 100644 Binary files a/extras/CohortGenerator.pdf and b/extras/CohortGenerator.pdf differ diff --git a/extras/CreateResultsForUpload.R b/extras/CreateResultsForUpload.R new file mode 100644 index 0000000..9b5f232 --- /dev/null +++ b/extras/CreateResultsForUpload.R @@ -0,0 +1,203 @@ +library(CohortGenerator) +resultsFolder <- "D:/TEMP/cg" +databaseId <- "Eunomia" +tablePrefix = "cg_" + +if (!dir.exists(resultsFolder)) { + dir.create(path = resultsFolder, recursive = TRUE) +} + +# First construct a cohort definition set: an empty +# data frame with the cohorts to generate +cohortDefinitionSet <- CohortGenerator::createEmptyCohortDefinitionSet() + +# Fill the cohort set using cohorts included in this +# package as an example +cohortJsonFiles <- list.files(path = system.file("testdata/name/cohorts", package = "CohortGenerator"), full.names = TRUE) +for (i in 1:length(cohortJsonFiles)) { + cohortJsonFileName <- cohortJsonFiles[i] + cohortName <- tools::file_path_sans_ext(basename(cohortJsonFileName)) + # Here we read in the JSON in order to create the SQL + # using [CirceR](https://ohdsi.github.io/CirceR/) + # If you have your JSON and SQL stored differenly, you can + # modify this to read your JSON/SQL files however you require + cohortJson <- readChar(cohortJsonFileName, file.info(cohortJsonFileName)$size) + cohortExpression <- CirceR::cohortExpressionFromJson(cohortJson) + cohortSql <- CirceR::buildCohortQuery(cohortExpression, options = CirceR::createGenerateOptions(generateStats = TRUE)) + cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame(cohortId = as.numeric(i), + cohortName = cohortName, + json = cohortJson, + sql = cohortSql, + stringsAsFactors = FALSE)) +} + +subsetOperations <- list( + createDemographicSubset( + name = "Demographic Criteria 1", + ageMin = 18, + ageMax = 64 + ), + createDemographicSubset( + name = "Demographic Criteria 2", + ageMin = 32, + ageMax = 48 + ) +) +subsetDef <- createCohortSubsetDefinition( + name = "test definition 123", + definitionId = 1, + subsetOperators = subsetOperations, + subsetCohortNameTemplate = "FOOO @baseCohortName @subsetDefinitionName @operatorNames", + operatorNameConcatString = "zzzz" +) + +cohortDefinitionSet <- cohortDefinitionSet |> + 
CohortGenerator::addCohortSubsetDefinition(subsetDef) + +# Massage and save the cohort definition set +colsToRename <- c("cohortId", "cohortName", "sql", "json") +colInd <- which(names(cohortDefinitionSet) %in% colsToRename) +cohortDefinitions <- cohortDefinitionSet +names(cohortDefinitions)[colInd] <- c("cohortDefinitionId", "cohortName", "sqlCommand", "json") +cohortDefinitions$description <- "" +CohortGenerator::writeCsv( + x = cohortDefinitions, + file = file.path(resultsFolder, "cohort_definition.csv") +) + +# Export the subsets +subsets <- CohortGenerator::getSubsetDefinitions(cohortDefinitionSet) +if (length(subsets)) { + dfs <- lapply(subsets, function(x) { + data.frame(subsetDefinitionId = x$definitionId, json = as.character(x$toJSON())) + }) + subsetDefinitions <- data.frame() + for (subsetDef in dfs) { + subsetDefinitions <- rbind(subsetDefinitions, subsetDef) + } + + CohortGenerator::writeCsv( + x = subsetDefinitions, + file = file.path(resultsFolder, "cohort_subset_definition.csv") + ) +} + + +# Generate the cohort set against Eunomia. +# cohortsGenerated contains a list of the cohortIds +# successfully generated against the CDM +connectionDetails <- Eunomia::getEunomiaConnectionDetails() + +# Create the cohort tables to hold the cohort generation results +cohortTableNames <- CohortGenerator::getCohortTableNames(cohortTable = "my_cohort_table") +CohortGenerator::createCohortTables(connectionDetails = connectionDetails, + cohortDatabaseSchema = "main", + cohortTableNames = cohortTableNames) +# Generate the cohorts +cohortsGenerated <- CohortGenerator::generateCohortSet(connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + cohortDatabaseSchema = "main", + cohortTableNames = cohortTableNames, + cohortDefinitionSet = cohortDefinitionSet) + + +cohortsGenerated$databaseId <- databaseId +CohortGenerator::writeCsv( + x = cohortsGenerated, + file = file.path(resultsFolder, "cohort_generation.csv") +) + +# Get the cohort counts +cohortCounts <- CohortGenerator::getCohortCounts(connectionDetails = connectionDetails, + cohortDatabaseSchema = "main", + cohortTable = cohortTableNames$cohortTable, + databaseId = databaseId) +cohortCounts <- cohortCounts[c("databaseId", "cohortId", "cohortEntries", "cohortSubjects")] + +CohortGenerator::writeCsv( + x = cohortCounts, + file = file.path(resultsFolder, "cohort_count.csv") +) + +# Generate the negative controls +negativeControlOutcomes <- readCsv(file = system.file("testdata/negativecontrols/negativeControlOutcomes.csv", + package = "CohortGenerator", + mustWork = TRUE +)) +negativeControlOutcomes$cohortId <- negativeControlOutcomes$outcomeConceptId + + +CohortGenerator::generateNegativeControlOutcomeCohorts( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + cohortDatabaseSchema = "main", + cohortTable = cohortTableNames$cohortTable, + negativeControlOutcomeCohortSet = negativeControlOutcomes, + occurrenceType = "all", + detectOnDescendants = TRUE, + incremental = F +) + +cohortCountsNegativeControlOutcomes <- CohortGenerator::getCohortCounts( + connectionDetails = connectionDetails, + cohortDatabaseSchema = "main", + cohortTable = cohortTableNames$cohortTable, + databaseId = databaseId, + cohortIds = negativeControlOutcomes$cohortId +) + +CohortGenerator::writeCsv( + x = cohortCountsNegativeControlOutcomes, + file = file.path(resultsFolder, "cohort_count_neg_ctrl.csv") +) + +CohortGenerator::exportCohortStatsTables( + connectionDetails = connectionDetails, + cohortStatisticsFolder = resultsFolder, + 
+CohortGenerator::exportCohortStatsTables(
+  connectionDetails = connectionDetails,
+  cohortStatisticsFolder = resultsFolder,
+  cohortDatabaseSchema = "main",
+  databaseId = databaseId,
+  snakeCaseToCamelCase = FALSE,
+  fileNamesInSnakeCase = TRUE,
+  cohortTableNames = cohortTableNames,
+  cohortDefinitionSet = cohortDefinitionSet
+)
+
+# Rename the exported stats files to match the table names in
+# resultsDataModelSpecification.csv
+resultsDataModel <- CohortGenerator::getResultsDataModelSpecifications()
+oldTableNames <- gsub("cg_", "", resultsDataModel$tableName)
+file.rename(
+  file.path(resultsFolder, paste0(unique(oldTableNames), ".csv")),
+  file.path(resultsFolder, paste0(unique(resultsDataModel$tableName), ".csv"))
+)
+CohortGenerator::writeCsv(
+  x = resultsDataModel,
+  file = file.path(resultsFolder, "resultsDataModelSpecification.csv"),
+  warnOnCaseMismatch = FALSE,
+  warnOnFileNameCaseMismatch = FALSE,
+  warnOnUploadRuleViolations = FALSE
+)
+
+
+# Test the upload
+CohortGenerator::createResultsDataModel(
+  connectionDetails = connectionDetails,
+  databaseSchema = "main"
+)
+
+CohortGenerator::uploadResults(
+  connectionDetails = connectionDetails,
+  schema = "main",
+  resultsFolder = resultsFolder,
+  purgeSiteDataBeforeUploading = FALSE
+)
+
+#conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
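+# For example (illustrative only), the upload could then be checked with:
+#DatabaseConnector::querySql(conn, "SELECT COUNT(*) FROM main.cg_cohort_count;")
+#DatabaseConnector::disconnect(conn)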
+
+zip::zip(
+  zipfile = file.path("inst", "testdata", "Results_Eunomia.zip"),
+  files = list.files(resultsFolder, full.names = TRUE),
+  mode = "cherry-pick"
+)
+
+unlink(connectionDetails$server())
+unlink(resultsFolder, recursive = TRUE)
diff --git a/inst/cohortDefinitionSetSpecificationDescription.csv b/inst/cohortDefinitionSetSpecificationDescription.csv
deleted file mode 100644
index 5337d98..0000000
--- a/inst/cohortDefinitionSetSpecificationDescription.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-column_name,description,data_type
-cohortId,The identifier for the cohort in the cohort definition set.,numeric
-cohortName,The name of the cohort in the cohort definition set.,character
-sql,The SQL code used to construct the cohort,character
-json,The optional Circe compliant JSON representation of the cohort definition.,character
diff --git a/inst/csv/resultsDataModelSpecification.csv b/inst/csv/resultsDataModelSpecification.csv
new file mode 100644
index 0000000..9eec42d
--- /dev/null
+++ b/inst/csv/resultsDataModelSpecification.csv
@@ -0,0 +1,54 @@
+table_name,column_name,data_type,is_required,primary_key,min_cell_count,description
+cg_cohort_definition,cohort_definition_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition
+cg_cohort_definition,cohort_name,varchar,Yes,No,No,The name of the cohort definition
+cg_cohort_definition,description,varchar,No,No,No,A description of the cohort definition
+cg_cohort_definition,json,text,No,No,No,The circe-be compliant JSON expression
+cg_cohort_definition,sql_command,text,No,No,No,The OHDSI-SQL command used to construct the cohort
+cg_cohort_definition,subset_parent,bigint,No,No,No,The parent cohort id if this cohort is a subset
+cg_cohort_definition,is_subset,int,No,No,No,This value is 1 when the cohort is a subset
+cg_cohort_definition,subset_definition_id,bigint,No,No,No,The cohort subset definition
+cg_cohort_generation,cohort_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition
+cg_cohort_generation,cohort_name,varchar,Yes,No,No,The name of the cohort generated
+cg_cohort_generation,generation_status,varchar,No,No,No,The cohort generation status
+cg_cohort_generation,start_time,Timestamp,No,No,No,The start time of the generation process
+cg_cohort_generation,end_time,Timestamp,No,No,No,The end time of the generation process
+cg_cohort_generation,database_id,varchar,Yes,Yes,No,The database identifier for this information
+cg_cohort_inclusion,cohort_definition_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition
+cg_cohort_inclusion,rule_sequence,int,Yes,Yes,No,The rule sequence for the inclusion rule
+cg_cohort_inclusion,name,varchar,Yes,Yes,No,The name of the inclusion rule
+cg_cohort_inclusion,description,varchar,No,No,No,The description of the inclusion rule
+cg_cohort_inc_result,database_id,varchar,Yes,Yes,No,The database identifier for this information
+cg_cohort_inc_result,cohort_definition_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition
+cg_cohort_inc_result,inclusion_rule_mask,int,Yes,Yes,No,A bit-mask for the inclusion rule
+cg_cohort_inc_result,person_count,bigint,Yes,Yes,Yes,The number of persons satisfying the inclusion rule
+cg_cohort_inc_result,mode_id,int,Yes,Yes,No,The mode of the inclusion rule
+cg_cohort_inc_stats,database_id,varchar,Yes,Yes,No,The database identifier for this information
+cg_cohort_inc_stats,cohort_definition_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition
+cg_cohort_inc_stats,rule_sequence,int,Yes,Yes,No,The rule sequence
+cg_cohort_inc_stats,person_count,bigint,Yes,Yes,Yes,The person count
+cg_cohort_inc_stats,gain_count,bigint,Yes,Yes,No,The gain count
+cg_cohort_inc_stats,person_total,bigint,Yes,Yes,Yes,The person total
+cg_cohort_inc_stats,mode_id,int,Yes,Yes,No,The mode id
+cg_cohort_summary_stats,database_id,varchar,Yes,Yes,No,The database identifier for this information
+cg_cohort_summary_stats,cohort_definition_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition
+cg_cohort_summary_stats,base_count,bigint,Yes,Yes,Yes,The base count
+cg_cohort_summary_stats,final_count,bigint,Yes,Yes,Yes,The final count
+cg_cohort_summary_stats,mode_id,int,Yes,Yes,No,The mode id
+cg_cohort_censor_stats,cohort_definition_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition
+cg_cohort_censor_stats,lost_count,bigint,Yes,Yes,Yes,The number lost due to censoring
+cg_cohort_censor_stats,database_id,varchar,Yes,Yes,No,The database identifier for this information
+cg_cohort_count,database_id,varchar,Yes,Yes,No,The database identifier for this information
+cg_cohort_count,cohort_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition
+cg_cohort_count,cohort_entries,bigint,Yes,Yes,Yes,The number of cohort entries
+cg_cohort_count,cohort_subjects,bigint,Yes,Yes,Yes,The number of unique subjects
+cg_cohort_count_neg_ctrl,database_id,varchar,Yes,Yes,No,The database identifier for this information
+cg_cohort_count_neg_ctrl,cohort_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition
+cg_cohort_count_neg_ctrl,cohort_entries,bigint,Yes,Yes,Yes,The number of cohort entries
+cg_cohort_count_neg_ctrl,cohort_subjects,bigint,Yes,Yes,Yes,The number of unique subjects
+cg_cohort_subset_definition,subset_definition_id,bigint,Yes,Yes,No,The identifier for the cohort subset definition
+cg_cohort_subset_definition,json,text,Yes,No,No,The JSON representation of the subset definition
+cg_cohort_definition_neg_ctrl,cohort_id,bigint,Yes,Yes,No,The cohort identifier for the negative control outcome
+cg_cohort_definition_neg_ctrl,outcome_concept_id,bigint,Yes,No,No,The concept ID for the negative control outcome
+cg_cohort_definition_neg_ctrl,cohort_name,varchar,Yes,No,No,The concept name for the negative control outcome
+cg_cohort_definition_neg_ctrl,occurrence_type,varchar,Yes,No,No,The occurrence_type detects either the first time an outcome_concept_id occurs or all times the outcome_concept_id occurs for a person. Accepted values: 'all' or 'first'
+cg_cohort_definition_neg_ctrl,detect_on_descendants,int,Yes,No,No,When set to 1 detect_on_descendants uses the vocabulary to find negative control outcomes using the outcome_concept_id and all descendants via the concept_ancestor table. When set to 0 only the exact outcome_concept_id is used to detect the outcome
diff --git a/inst/doc/CreatingCohortSubsetDefinitions.pdf b/inst/doc/CreatingCohortSubsetDefinitions.pdf
index 61986cb..cf698b5 100644
Binary files a/inst/doc/CreatingCohortSubsetDefinitions.pdf and b/inst/doc/CreatingCohortSubsetDefinitions.pdf differ
diff --git a/inst/doc/GeneratingCohorts.pdf b/inst/doc/GeneratingCohorts.pdf
index bcb7ad4..c6f8e97 100644
Binary files a/inst/doc/GeneratingCohorts.pdf and b/inst/doc/GeneratingCohorts.pdf differ
diff --git a/inst/doc/SamplingCohorts.pdf b/inst/doc/SamplingCohorts.pdf
index a27ffb3..829e798 100644
Binary files a/inst/doc/SamplingCohorts.pdf and b/inst/doc/SamplingCohorts.pdf differ
diff --git a/inst/negativeControlOutcomeCohortSetSpecificationDescription.csv b/inst/negativeControlOutcomeCohortSetSpecificationDescription.csv
deleted file mode 100644
index 6ce6b37..0000000
--- a/inst/negativeControlOutcomeCohortSetSpecificationDescription.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-column_name,description,data_type
-cohortId,The identifier for the cohort in the negative control outcome cohort set.,integer64
-cohortName,The name of the cohort in the negative control outcome cohort set.,character
-outcomeConceptId,The concept_id used to construct the negative control cohort. This concept_id must be in the condition domain,integer64
diff --git a/inst/sql/sql_server/CreateResultsDataModel.sql b/inst/sql/sql_server/CreateResultsDataModel.sql
new file mode 100644
index 0000000..9440b46
--- /dev/null
+++ b/inst/sql/sql_server/CreateResultsDataModel.sql
@@ -0,0 +1,80 @@
+CREATE TABLE @database_schema.@table_prefixcg_cohort_definition (
+  cohort_definition_id BIGINT NOT NULL,
+  cohort_name VARCHAR,
+  description VARCHAR,
+  json TEXT,
+  sql_command TEXT,
+  subset_parent BIGINT,
+  is_subset INT,
+  subset_definition_id BIGINT,
+  PRIMARY KEY(cohort_definition_id)
+);
+
+CREATE TABLE @database_schema.@table_prefixcg_cohort_generation (
+  cohort_id BIGINT NOT NULL,
+  cohort_name VARCHAR,
+  generation_status VARCHAR,
+  start_time TIMESTAMP,
+  end_time TIMESTAMP,
+  database_id VARCHAR NOT NULL,
+  PRIMARY KEY(cohort_id,database_id)
+);
+
+CREATE TABLE @database_schema.@table_prefixcg_cohort_inclusion (
+  cohort_definition_id BIGINT NOT NULL,
+  rule_sequence INT NOT NULL,
+  name VARCHAR NOT NULL,
+  description VARCHAR,
+  PRIMARY KEY(cohort_definition_id,rule_sequence,name)
+);
+
+CREATE TABLE @database_schema.@table_prefixcg_cohort_inc_result (
+  database_id VARCHAR NOT NULL,
+  cohort_definition_id BIGINT NOT NULL,
+  inclusion_rule_mask INT NOT NULL,
+  person_count BIGINT NOT NULL,
+  mode_id INT NOT NULL,
+  PRIMARY KEY(database_id,cohort_definition_id,inclusion_rule_mask,person_count,mode_id)
+);
+
+CREATE TABLE @database_schema.@table_prefixcg_cohort_inc_stats (
+  database_id VARCHAR NOT NULL,
+  cohort_definition_id BIGINT NOT NULL,
+  rule_sequence INT NOT NULL,
+  person_count BIGINT NOT NULL,
+  gain_count BIGINT NOT NULL,
+  person_total BIGINT NOT NULL,
+  mode_id INT NOT NULL,
+  PRIMARY KEY(database_id,cohort_definition_id,rule_sequence,person_count,gain_count,person_total,mode_id)
+);
+
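+-- The mode_id column on the statistics tables records which Circe
+-- generation mode a row was computed for: 0 = statistics over all
+-- qualifying events, 1 = statistics over the single best event per person.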
+CREATE TABLE @database_schema.@table_prefixcg_cohort_summary_stats ( + database_id VARCHAR NOT NULL, + cohort_definition_id BIGINT NOT NULL, + base_count BIGINT NOT NULL, + final_count BIGINT NOT NULL, + mode_id INT NOT NULL, + PRIMARY KEY(database_id,cohort_definition_id,base_count,final_count,mode_id) +); + +CREATE TABLE @database_schema.@table_prefixcg_cohort_censor_stats ( + cohort_definition_id BIGINT NOT NULL, + lost_count BIGINT NOT NULL, + PRIMARY KEY(cohort_definition_id,lost_count) +); + +CREATE TABLE @database_schema.@table_prefixcg_cohort_count ( + database_id VARCHAR NOT NULL, + cohort_id BIGINT NOT NULL, + cohort_entries BIGINT NOT NULL, + cohort_subjects BIGINT NOT NULL, + PRIMARY KEY(database_id,cohort_id,cohort_entries,cohort_subjects) +); + +CREATE TABLE @database_schema.@table_prefixcg_cohort_count_neg_ctrl ( + database_id VARCHAR NOT NULL, + cohort_id BIGINT NOT NULL, + cohort_entries BIGINT NOT NULL, + cohort_subjects BIGINT NOT NULL, + PRIMARY KEY(database_id,cohort_id,cohort_entries,cohort_subjects) +); diff --git a/inst/sql/sql_server/NegativeControlOutcomes.sql b/inst/sql/sql_server/NegativeControlOutcomes.sql index 9fa8566..c293af2 100644 --- a/inst/sql/sql_server/NegativeControlOutcomes.sql +++ b/inst/sql/sql_server/NegativeControlOutcomes.sql @@ -1,4 +1,5 @@ --- NOTE: #nc_set is created by R before calling this SQL code +@nc_set_query + CREATE TABLE #Codesets ( cohort_definition_id bigint NOT NULL, ancestor_concept_id int NOT NULL, diff --git a/inst/sql/sql_server/migrations/Migration_1-v0.10.0.sql b/inst/sql/sql_server/migrations/Migration_1-v0.10.0.sql new file mode 100644 index 0000000..325dcf0 --- /dev/null +++ b/inst/sql/sql_server/migrations/Migration_1-v0.10.0.sql @@ -0,0 +1,20 @@ +-- Database migrations for version 0.10.0 +-- Adds missing database_id to cg_cohort_censor_stats +-- Adds new table cg_cohort_subset_definition to hold the subset definitions +-- Adds new table cg_cohort_definition_neg_ctrl to hold the negative control outcomes cohort definitions +ALTER TABLE @database_schema.@table_prefixcg_cohort_censor_stats ADD database_id VARCHAR; + +CREATE TABLE @database_schema.@table_prefixcg_cohort_subset_definition ( + subset_definition_id BIGINT, + json varchar, + PRIMARY KEY(subset_definition_id) +); + +CREATE TABLE @database_schema.@table_prefixcg_cohort_definition_neg_ctrl ( + cohort_id BIGINT, + outcome_concept_id BIGINT, + cohort_name varchar, + occurrence_type varchar, + detect_on_descendants int, + PRIMARY KEY(cohort_id) +); diff --git a/inst/sql/sql_server/subsets/CohortSubsetOperator.sql b/inst/sql/sql_server/subsets/CohortSubsetOperator.sql index 892fb5f..d93448d 100644 --- a/inst/sql/sql_server/subsets/CohortSubsetOperator.sql +++ b/inst/sql/sql_server/subsets/CohortSubsetOperator.sql @@ -20,6 +20,7 @@ FROM ( HAVING COUNT (DISTINCT S.COHORT_DEFINITION_ID) >= @subset_length ) A {@negate == '1'}?{ -RIGHT JOIN @target_table B ON B.subject_id = A.subject_id +RIGHT JOIN @target_table B ON B.subject_id = A.subject_id + AND b.cohort_start_date = a.cohort_start_date WHERE A.subject_id IS NULL } diff --git a/inst/testdata/Results_Eunomia.zip b/inst/testdata/Results_Eunomia.zip new file mode 100644 index 0000000..4beb4c7 Binary files /dev/null and b/inst/testdata/Results_Eunomia.zip differ diff --git a/inst/testdata/id/Cohorts.csv b/inst/testdata/id/Cohorts.csv index 33045c7..78c9a72 100644 --- a/inst/testdata/id/Cohorts.csv +++ b/inst/testdata/id/Cohorts.csv @@ -1,4 +1,5 @@ atlas_id,cohort_id,cohort_name,logic_description,generate_stats 
10,1,celecoxib,Celecoxib exposure,FALSE 11,2,celecoxibAge40,Celecoxib exposure and age at index >= 40,FALSE -12,3,celecoxibAge40Male,Celecoxib exposure in males and age at index >= 40,FALSE \ No newline at end of file +12,3,celecoxibAge40Male,Celecoxib exposure in males and age at index >= 40,FALSE +13,4,celecoxibCensored,Celecoxib exposure censored after 2000 Jan 1,FALSE diff --git a/inst/testdata/id/cohorts/4.json b/inst/testdata/id/cohorts/4.json new file mode 100644 index 0000000..5a56150 --- /dev/null +++ b/inst/testdata/id/cohorts/4.json @@ -0,0 +1,58 @@ +{ + "ConceptSets": [ + { + "id": 0, + "name": "Celecoxib", + "expression": { + "items": [ + { + "concept": { + "CONCEPT_CLASS_ID": "Ingredient", + "CONCEPT_CODE": "140587", + "CONCEPT_ID": 1118084, + "CONCEPT_NAME": "celecoxib", + "DOMAIN_ID": "Drug", + "INVALID_REASON": "V", + "INVALID_REASON_CAPTION": "Valid", + "STANDARD_CONCEPT": "S", + "STANDARD_CONCEPT_CAPTION": "Standard", + "VOCABULARY_ID": "RxNorm" + } + } + ] + } + } + ], + "PrimaryCriteria": { + "CriteriaList": [ + { + "DrugEra": { + "CodesetId": 0 + } + } + ], + "ObservationWindow": { + "PriorDays": 0, + "PostDays": 0 + }, + "PrimaryCriteriaLimit": { + "Type": "First" + } + }, + "QualifiedLimit": { + "Type": "First" + }, + "ExpressionLimit": { + "Type": "First" + }, + "InclusionRules": [], + "CensoringCriteria": [], + "CollapseSettings": { + "CollapseType": "ERA", + "EraPad": 0 + }, + "CensorWindow": { + "StartDate": "2000-01-01" + }, + "cdmVersionRange": ">=5.0.0" +} \ No newline at end of file diff --git a/inst/testdata/id/sql/sql_server/4.sql b/inst/testdata/id/sql/sql_server/4.sql new file mode 100644 index 0000000..8bcc719 --- /dev/null +++ b/inst/testdata/id/sql/sql_server/4.sql @@ -0,0 +1,284 @@ +CREATE TABLE #Codesets ( + codeset_id int NOT NULL, + concept_id bigint NOT NULL +) +; + +INSERT INTO #Codesets (codeset_id, concept_id) +SELECT 0 as codeset_id, c.concept_id FROM (select distinct I.concept_id FROM +( + select concept_id from @vocabulary_database_schema.CONCEPT where (concept_id in (1118084)) + +) I +) C; + +UPDATE STATISTICS #Codesets; + + +SELECT event_id, person_id, start_date, end_date, op_start_date, op_end_date, visit_occurrence_id +INTO #qualified_events +FROM +( + select pe.event_id, pe.person_id, pe.start_date, pe.end_date, pe.op_start_date, pe.op_end_date, row_number() over (partition by pe.person_id order by pe.start_date ASC) as ordinal, cast(pe.visit_occurrence_id as bigint) as visit_occurrence_id + FROM (-- Begin Primary Events +select P.ordinal as event_id, P.person_id, P.start_date, P.end_date, op_start_date, op_end_date, cast(P.visit_occurrence_id as bigint) as visit_occurrence_id +FROM +( + select E.person_id, E.start_date, E.end_date, + row_number() OVER (PARTITION BY E.person_id ORDER BY E.sort_date ASC, E.event_id) ordinal, + OP.observation_period_start_date as op_start_date, OP.observation_period_end_date as op_end_date, cast(E.visit_occurrence_id as bigint) as visit_occurrence_id + FROM + ( + -- Begin Drug Era Criteria +select C.person_id, C.drug_era_id as event_id, C.start_date, C.end_date, + CAST(NULL as bigint) as visit_occurrence_id,C.start_date as sort_date +from +( + select de.person_id,de.drug_era_id,de.drug_concept_id,de.drug_exposure_count,de.gap_days,de.drug_era_start_date as start_date, de.drug_era_end_date as end_date + FROM @cdm_database_schema.DRUG_ERA de +where de.drug_concept_id in (SELECT concept_id from #Codesets where codeset_id = 0) +) C + + +-- End Drug Era Criteria + + ) E + JOIN 
@cdm_database_schema.observation_period OP on E.person_id = OP.person_id and E.start_date >= OP.observation_period_start_date and E.start_date <= op.observation_period_end_date + WHERE DATEADD(day,0,OP.OBSERVATION_PERIOD_START_DATE) <= E.START_DATE AND DATEADD(day,0,E.START_DATE) <= OP.OBSERVATION_PERIOD_END_DATE +) P +WHERE P.ordinal = 1 +-- End Primary Events +) pe + +) QE + +; + +--- Inclusion Rule Inserts + +create table #inclusion_events (inclusion_rule_id bigint, + person_id bigint, + event_id bigint +); + +select event_id, person_id, start_date, end_date, op_start_date, op_end_date +into #included_events +FROM ( + SELECT event_id, person_id, start_date, end_date, op_start_date, op_end_date, row_number() over (partition by person_id order by start_date ASC) as ordinal + from + ( + select Q.event_id, Q.person_id, Q.start_date, Q.end_date, Q.op_start_date, Q.op_end_date, SUM(coalesce(POWER(cast(2 as bigint), I.inclusion_rule_id), 0)) as inclusion_rule_mask + from #qualified_events Q + LEFT JOIN #inclusion_events I on I.person_id = Q.person_id and I.event_id = Q.event_id + GROUP BY Q.event_id, Q.person_id, Q.start_date, Q.end_date, Q.op_start_date, Q.op_end_date + ) MG -- matching groups +{0 != 0}?{ + -- the matching group with all bits set ( POWER(2,# of inclusion rules) - 1 = inclusion_rule_mask + WHERE (MG.inclusion_rule_mask = POWER(cast(2 as bigint),0)-1) +} +) Results +WHERE Results.ordinal = 1 +; + + + +-- generate cohort periods into #final_cohort +select person_id, start_date, end_date +INTO #cohort_rows +from ( -- first_ends + select F.person_id, F.start_date, F.end_date + FROM ( + select I.event_id, I.person_id, I.start_date, CE.end_date, row_number() over (partition by I.person_id, I.event_id order by CE.end_date) as ordinal + from #included_events I + join ( -- cohort_ends +-- cohort exit dates +-- By default, cohort exit at the event's op end date +select event_id, person_id, op_end_date as end_date from #included_events + ) CE on I.event_id = CE.event_id and I.person_id = CE.person_id and CE.end_date >= I.start_date + ) F + WHERE F.ordinal = 1 +) FE; + + +select person_id, min(start_date) as start_date, DATEADD(day,-1 * 0, max(end_date)) as end_date +into #final_cohort +from ( + select person_id, start_date, end_date, sum(is_start) over (partition by person_id order by start_date, is_start desc rows unbounded preceding) group_idx + from ( + select person_id, start_date, end_date, + case when max(end_date) over (partition by person_id order by start_date rows between unbounded preceding and 1 preceding) >= start_date then 0 else 1 end is_start + from ( + select person_id, start_date, DATEADD(day,0,end_date) as end_date + from #cohort_rows + ) CR + ) ST +) GR +group by person_id, group_idx; + +DELETE FROM @target_database_schema.@target_cohort_table where cohort_definition_id = @target_cohort_id; +INSERT INTO @target_database_schema.@target_cohort_table (cohort_definition_id, subject_id, cohort_start_date, cohort_end_date) +select @target_cohort_id as cohort_definition_id, person_id, CASE WHEN start_date > DATEFROMPARTS(2000, 1, 1) THEN start_date ELSE DATEFROMPARTS(2000, 1, 1) END, end_date +FROM #final_cohort CO +WHERE CASE WHEN start_date > DATEFROMPARTS(2000, 1, 1) THEN start_date ELSE DATEFROMPARTS(2000, 1, 1) END <= end_date +; + +{1 != 0}?{ +-- BEGIN: Censored Stats + +delete from @results_database_schema.cohort_censor_stats where cohort_definition_id = @target_cohort_id; +-- calculate censored +insert into @results_database_schema.cohort_censor_stats 
(cohort_definition_id, lost_count) + select @target_cohort_id as cohort_definition_id, coalesce(FCC.total_people - TC.total, 0) as lost_count + FROM + (select count_big(distinct person_id) as total_people from #final_cohort) FCC, + (select count_big(distinct subject_id) as total from @target_database_schema.@target_cohort_table t where t.cohort_definition_id = @target_cohort_id) TC; +-- END: Censored Stats +} +{1 != 0 & 0 != 0}?{ + +CREATE TABLE #inclusion_rules (rule_sequence int); + +-- Find the event that is the 'best match' per person. +-- the 'best match' is defined as the event that satisfies the most inclusion rules. +-- ties are solved by choosing the event that matches the earliest inclusion rule, and then earliest. + +select q.person_id, q.event_id +into #best_events +from #qualified_events Q +join ( + SELECT R.person_id, R.event_id, ROW_NUMBER() OVER (PARTITION BY R.person_id ORDER BY R.rule_count DESC,R.min_rule_id ASC, R.start_date ASC) AS rank_value + FROM ( + SELECT Q.person_id, Q.event_id, COALESCE(COUNT(DISTINCT I.inclusion_rule_id), 0) AS rule_count, COALESCE(MIN(I.inclusion_rule_id), 0) AS min_rule_id, Q.start_date + FROM #qualified_events Q + LEFT JOIN #inclusion_events I ON q.person_id = i.person_id AND q.event_id = i.event_id + GROUP BY Q.person_id, Q.event_id, Q.start_date + ) R +) ranked on Q.person_id = ranked.person_id and Q.event_id = ranked.event_id +WHERE ranked.rank_value = 1 +; + +-- modes of generation: (the same tables store the results for the different modes, identified by the mode_id column) +-- 0: all events +-- 1: best event + + +-- BEGIN: Inclusion Impact Analysis - event +-- calculte matching group counts +delete from @results_database_schema.cohort_inclusion_result where cohort_definition_id = @target_cohort_id and mode_id = 0; +insert into @results_database_schema.cohort_inclusion_result (cohort_definition_id, inclusion_rule_mask, person_count, mode_id) +select @target_cohort_id as cohort_definition_id, inclusion_rule_mask, count_big(*) as person_count, 0 as mode_id +from +( + select Q.person_id, Q.event_id, CAST(SUM(coalesce(POWER(cast(2 as bigint), I.inclusion_rule_id), 0)) AS bigint) as inclusion_rule_mask + from #qualified_events Q + LEFT JOIN #inclusion_events I on q.person_id = i.person_id and q.event_id = i.event_id + GROUP BY Q.person_id, Q.event_id +) MG -- matching groups +group by inclusion_rule_mask +; + +-- calculate gain counts +delete from @results_database_schema.cohort_inclusion_stats where cohort_definition_id = @target_cohort_id and mode_id = 0; +insert into @results_database_schema.cohort_inclusion_stats (cohort_definition_id, rule_sequence, person_count, gain_count, person_total, mode_id) +select @target_cohort_id as cohort_definition_id, ir.rule_sequence, coalesce(T.person_count, 0) as person_count, coalesce(SR.person_count, 0) gain_count, EventTotal.total, 0 as mode_id +from #inclusion_rules ir +left join +( + select i.inclusion_rule_id, count_big(i.event_id) as person_count + from #qualified_events Q + JOIN #inclusion_events i on Q.person_id = I.person_id and Q.event_id = i.event_id + group by i.inclusion_rule_id +) T on ir.rule_sequence = T.inclusion_rule_id +CROSS JOIN (select count(*) as total_rules from #inclusion_rules) RuleTotal +CROSS JOIN (select count_big(event_id) as total from #qualified_events) EventTotal +LEFT JOIN @results_database_schema.cohort_inclusion_result SR on SR.mode_id = 0 AND SR.cohort_definition_id = @target_cohort_id AND (POWER(cast(2 as bigint),RuleTotal.total_rules) - POWER(cast(2 as 
bigint),ir.rule_sequence) - 1) = SR.inclusion_rule_mask -- POWER(2,rule count) - POWER(2,rule sequence) - 1 is the mask for 'all except this rule' +; + +-- calculate totals +delete from @results_database_schema.cohort_summary_stats where cohort_definition_id = @target_cohort_id and mode_id = 0; +insert into @results_database_schema.cohort_summary_stats (cohort_definition_id, base_count, final_count, mode_id) +select @target_cohort_id as cohort_definition_id, PC.total as person_count, coalesce(FC.total, 0) as final_count, 0 as mode_id +FROM +(select count_big(event_id) as total from #qualified_events) PC, +(select sum(sr.person_count) as total + from @results_database_schema.cohort_inclusion_result sr + CROSS JOIN (select count(*) as total_rules from #inclusion_rules) RuleTotal + where sr.mode_id = 0 and sr.cohort_definition_id = @target_cohort_id and sr.inclusion_rule_mask = POWER(cast(2 as bigint),RuleTotal.total_rules)-1 +) FC +; + +-- END: Inclusion Impact Analysis - event + +-- BEGIN: Inclusion Impact Analysis - person +-- calculte matching group counts +delete from @results_database_schema.cohort_inclusion_result where cohort_definition_id = @target_cohort_id and mode_id = 1; +insert into @results_database_schema.cohort_inclusion_result (cohort_definition_id, inclusion_rule_mask, person_count, mode_id) +select @target_cohort_id as cohort_definition_id, inclusion_rule_mask, count_big(*) as person_count, 1 as mode_id +from +( + select Q.person_id, Q.event_id, CAST(SUM(coalesce(POWER(cast(2 as bigint), I.inclusion_rule_id), 0)) AS bigint) as inclusion_rule_mask + from #best_events Q + LEFT JOIN #inclusion_events I on q.person_id = i.person_id and q.event_id = i.event_id + GROUP BY Q.person_id, Q.event_id +) MG -- matching groups +group by inclusion_rule_mask +; + +-- calculate gain counts +delete from @results_database_schema.cohort_inclusion_stats where cohort_definition_id = @target_cohort_id and mode_id = 1; +insert into @results_database_schema.cohort_inclusion_stats (cohort_definition_id, rule_sequence, person_count, gain_count, person_total, mode_id) +select @target_cohort_id as cohort_definition_id, ir.rule_sequence, coalesce(T.person_count, 0) as person_count, coalesce(SR.person_count, 0) gain_count, EventTotal.total, 1 as mode_id +from #inclusion_rules ir +left join +( + select i.inclusion_rule_id, count_big(i.event_id) as person_count + from #best_events Q + JOIN #inclusion_events i on Q.person_id = I.person_id and Q.event_id = i.event_id + group by i.inclusion_rule_id +) T on ir.rule_sequence = T.inclusion_rule_id +CROSS JOIN (select count(*) as total_rules from #inclusion_rules) RuleTotal +CROSS JOIN (select count_big(event_id) as total from #best_events) EventTotal +LEFT JOIN @results_database_schema.cohort_inclusion_result SR on SR.mode_id = 1 AND SR.cohort_definition_id = @target_cohort_id AND (POWER(cast(2 as bigint),RuleTotal.total_rules) - POWER(cast(2 as bigint),ir.rule_sequence) - 1) = SR.inclusion_rule_mask -- POWER(2,rule count) - POWER(2,rule sequence) - 1 is the mask for 'all except this rule' +; + +-- calculate totals +delete from @results_database_schema.cohort_summary_stats where cohort_definition_id = @target_cohort_id and mode_id = 1; +insert into @results_database_schema.cohort_summary_stats (cohort_definition_id, base_count, final_count, mode_id) +select @target_cohort_id as cohort_definition_id, PC.total as person_count, coalesce(FC.total, 0) as final_count, 1 as mode_id +FROM +(select count_big(event_id) as total from #best_events) PC, +(select 
sum(sr.person_count) as total + from @results_database_schema.cohort_inclusion_result sr + CROSS JOIN (select count(*) as total_rules from #inclusion_rules) RuleTotal + where sr.mode_id = 1 and sr.cohort_definition_id = @target_cohort_id and sr.inclusion_rule_mask = POWER(cast(2 as bigint),RuleTotal.total_rules)-1 +) FC +; + +-- END: Inclusion Impact Analysis - person + +TRUNCATE TABLE #best_events; +DROP TABLE #best_events; + +TRUNCATE TABLE #inclusion_rules; +DROP TABLE #inclusion_rules; +} + + + +TRUNCATE TABLE #cohort_rows; +DROP TABLE #cohort_rows; + +TRUNCATE TABLE #final_cohort; +DROP TABLE #final_cohort; + +TRUNCATE TABLE #inclusion_events; +DROP TABLE #inclusion_events; + +TRUNCATE TABLE #qualified_events; +DROP TABLE #qualified_events; + +TRUNCATE TABLE #included_events; +DROP TABLE #included_events; + +TRUNCATE TABLE #Codesets; +DROP TABLE #Codesets; diff --git a/inst/testdata/name/Cohorts.csv b/inst/testdata/name/Cohorts.csv index 0ec4360..0877cb3 100644 --- a/inst/testdata/name/Cohorts.csv +++ b/inst/testdata/name/Cohorts.csv @@ -1,4 +1,5 @@ cohort_id,cohort_name 1,celecoxib 2,celecoxibAge40 -3,celecoxibAge40Male \ No newline at end of file +3,celecoxibAge40Male +4,celecoxibCensored \ No newline at end of file diff --git a/inst/testdata/name/cohorts/celecoxibCensored.json b/inst/testdata/name/cohorts/celecoxibCensored.json new file mode 100644 index 0000000..5a56150 --- /dev/null +++ b/inst/testdata/name/cohorts/celecoxibCensored.json @@ -0,0 +1,58 @@ +{ + "ConceptSets": [ + { + "id": 0, + "name": "Celecoxib", + "expression": { + "items": [ + { + "concept": { + "CONCEPT_CLASS_ID": "Ingredient", + "CONCEPT_CODE": "140587", + "CONCEPT_ID": 1118084, + "CONCEPT_NAME": "celecoxib", + "DOMAIN_ID": "Drug", + "INVALID_REASON": "V", + "INVALID_REASON_CAPTION": "Valid", + "STANDARD_CONCEPT": "S", + "STANDARD_CONCEPT_CAPTION": "Standard", + "VOCABULARY_ID": "RxNorm" + } + } + ] + } + } + ], + "PrimaryCriteria": { + "CriteriaList": [ + { + "DrugEra": { + "CodesetId": 0 + } + } + ], + "ObservationWindow": { + "PriorDays": 0, + "PostDays": 0 + }, + "PrimaryCriteriaLimit": { + "Type": "First" + } + }, + "QualifiedLimit": { + "Type": "First" + }, + "ExpressionLimit": { + "Type": "First" + }, + "InclusionRules": [], + "CensoringCriteria": [], + "CollapseSettings": { + "CollapseType": "ERA", + "EraPad": 0 + }, + "CensorWindow": { + "StartDate": "2000-01-01" + }, + "cdmVersionRange": ">=5.0.0" +} \ No newline at end of file diff --git a/inst/testdata/name/sql/sql_server/celecoxibCensored.sql b/inst/testdata/name/sql/sql_server/celecoxibCensored.sql new file mode 100644 index 0000000..8bcc719 --- /dev/null +++ b/inst/testdata/name/sql/sql_server/celecoxibCensored.sql @@ -0,0 +1,284 @@ +CREATE TABLE #Codesets ( + codeset_id int NOT NULL, + concept_id bigint NOT NULL +) +; + +INSERT INTO #Codesets (codeset_id, concept_id) +SELECT 0 as codeset_id, c.concept_id FROM (select distinct I.concept_id FROM +( + select concept_id from @vocabulary_database_schema.CONCEPT where (concept_id in (1118084)) + +) I +) C; + +UPDATE STATISTICS #Codesets; + + +SELECT event_id, person_id, start_date, end_date, op_start_date, op_end_date, visit_occurrence_id +INTO #qualified_events +FROM +( + select pe.event_id, pe.person_id, pe.start_date, pe.end_date, pe.op_start_date, pe.op_end_date, row_number() over (partition by pe.person_id order by pe.start_date ASC) as ordinal, cast(pe.visit_occurrence_id as bigint) as visit_occurrence_id + FROM (-- Begin Primary Events +select P.ordinal as event_id, P.person_id, P.start_date, 
P.end_date, op_start_date, op_end_date, cast(P.visit_occurrence_id as bigint) as visit_occurrence_id +FROM +( + select E.person_id, E.start_date, E.end_date, + row_number() OVER (PARTITION BY E.person_id ORDER BY E.sort_date ASC, E.event_id) ordinal, + OP.observation_period_start_date as op_start_date, OP.observation_period_end_date as op_end_date, cast(E.visit_occurrence_id as bigint) as visit_occurrence_id + FROM + ( + -- Begin Drug Era Criteria +select C.person_id, C.drug_era_id as event_id, C.start_date, C.end_date, + CAST(NULL as bigint) as visit_occurrence_id,C.start_date as sort_date +from +( + select de.person_id,de.drug_era_id,de.drug_concept_id,de.drug_exposure_count,de.gap_days,de.drug_era_start_date as start_date, de.drug_era_end_date as end_date + FROM @cdm_database_schema.DRUG_ERA de +where de.drug_concept_id in (SELECT concept_id from #Codesets where codeset_id = 0) +) C + + +-- End Drug Era Criteria + + ) E + JOIN @cdm_database_schema.observation_period OP on E.person_id = OP.person_id and E.start_date >= OP.observation_period_start_date and E.start_date <= op.observation_period_end_date + WHERE DATEADD(day,0,OP.OBSERVATION_PERIOD_START_DATE) <= E.START_DATE AND DATEADD(day,0,E.START_DATE) <= OP.OBSERVATION_PERIOD_END_DATE +) P +WHERE P.ordinal = 1 +-- End Primary Events +) pe + +) QE + +; + +--- Inclusion Rule Inserts + +create table #inclusion_events (inclusion_rule_id bigint, + person_id bigint, + event_id bigint +); + +select event_id, person_id, start_date, end_date, op_start_date, op_end_date +into #included_events +FROM ( + SELECT event_id, person_id, start_date, end_date, op_start_date, op_end_date, row_number() over (partition by person_id order by start_date ASC) as ordinal + from + ( + select Q.event_id, Q.person_id, Q.start_date, Q.end_date, Q.op_start_date, Q.op_end_date, SUM(coalesce(POWER(cast(2 as bigint), I.inclusion_rule_id), 0)) as inclusion_rule_mask + from #qualified_events Q + LEFT JOIN #inclusion_events I on I.person_id = Q.person_id and I.event_id = Q.event_id + GROUP BY Q.event_id, Q.person_id, Q.start_date, Q.end_date, Q.op_start_date, Q.op_end_date + ) MG -- matching groups +{0 != 0}?{ + -- the matching group with all bits set ( POWER(2,# of inclusion rules) - 1 = inclusion_rule_mask + WHERE (MG.inclusion_rule_mask = POWER(cast(2 as bigint),0)-1) +} +) Results +WHERE Results.ordinal = 1 +; + + + +-- generate cohort periods into #final_cohort +select person_id, start_date, end_date +INTO #cohort_rows +from ( -- first_ends + select F.person_id, F.start_date, F.end_date + FROM ( + select I.event_id, I.person_id, I.start_date, CE.end_date, row_number() over (partition by I.person_id, I.event_id order by CE.end_date) as ordinal + from #included_events I + join ( -- cohort_ends +-- cohort exit dates +-- By default, cohort exit at the event's op end date +select event_id, person_id, op_end_date as end_date from #included_events + ) CE on I.event_id = CE.event_id and I.person_id = CE.person_id and CE.end_date >= I.start_date + ) F + WHERE F.ordinal = 1 +) FE; + + +select person_id, min(start_date) as start_date, DATEADD(day,-1 * 0, max(end_date)) as end_date +into #final_cohort +from ( + select person_id, start_date, end_date, sum(is_start) over (partition by person_id order by start_date, is_start desc rows unbounded preceding) group_idx + from ( + select person_id, start_date, end_date, + case when max(end_date) over (partition by person_id order by start_date rows between unbounded preceding and 1 preceding) >= start_date then 0 else 1 end is_start + 
from ( + select person_id, start_date, DATEADD(day,0,end_date) as end_date + from #cohort_rows + ) CR + ) ST +) GR +group by person_id, group_idx; + +DELETE FROM @target_database_schema.@target_cohort_table where cohort_definition_id = @target_cohort_id; +INSERT INTO @target_database_schema.@target_cohort_table (cohort_definition_id, subject_id, cohort_start_date, cohort_end_date) +select @target_cohort_id as cohort_definition_id, person_id, CASE WHEN start_date > DATEFROMPARTS(2000, 1, 1) THEN start_date ELSE DATEFROMPARTS(2000, 1, 1) END, end_date +FROM #final_cohort CO +WHERE CASE WHEN start_date > DATEFROMPARTS(2000, 1, 1) THEN start_date ELSE DATEFROMPARTS(2000, 1, 1) END <= end_date +; + +{1 != 0}?{ +-- BEGIN: Censored Stats + +delete from @results_database_schema.cohort_censor_stats where cohort_definition_id = @target_cohort_id; +-- calculate censored +insert into @results_database_schema.cohort_censor_stats (cohort_definition_id, lost_count) + select @target_cohort_id as cohort_definition_id, coalesce(FCC.total_people - TC.total, 0) as lost_count + FROM + (select count_big(distinct person_id) as total_people from #final_cohort) FCC, + (select count_big(distinct subject_id) as total from @target_database_schema.@target_cohort_table t where t.cohort_definition_id = @target_cohort_id) TC; +-- END: Censored Stats +} +{1 != 0 & 0 != 0}?{ + +CREATE TABLE #inclusion_rules (rule_sequence int); + +-- Find the event that is the 'best match' per person. +-- the 'best match' is defined as the event that satisfies the most inclusion rules. +-- ties are solved by choosing the event that matches the earliest inclusion rule, and then earliest. + +select q.person_id, q.event_id +into #best_events +from #qualified_events Q +join ( + SELECT R.person_id, R.event_id, ROW_NUMBER() OVER (PARTITION BY R.person_id ORDER BY R.rule_count DESC,R.min_rule_id ASC, R.start_date ASC) AS rank_value + FROM ( + SELECT Q.person_id, Q.event_id, COALESCE(COUNT(DISTINCT I.inclusion_rule_id), 0) AS rule_count, COALESCE(MIN(I.inclusion_rule_id), 0) AS min_rule_id, Q.start_date + FROM #qualified_events Q + LEFT JOIN #inclusion_events I ON q.person_id = i.person_id AND q.event_id = i.event_id + GROUP BY Q.person_id, Q.event_id, Q.start_date + ) R +) ranked on Q.person_id = ranked.person_id and Q.event_id = ranked.event_id +WHERE ranked.rank_value = 1 +; + +-- modes of generation: (the same tables store the results for the different modes, identified by the mode_id column) +-- 0: all events +-- 1: best event + + +-- BEGIN: Inclusion Impact Analysis - event +-- calculte matching group counts +delete from @results_database_schema.cohort_inclusion_result where cohort_definition_id = @target_cohort_id and mode_id = 0; +insert into @results_database_schema.cohort_inclusion_result (cohort_definition_id, inclusion_rule_mask, person_count, mode_id) +select @target_cohort_id as cohort_definition_id, inclusion_rule_mask, count_big(*) as person_count, 0 as mode_id +from +( + select Q.person_id, Q.event_id, CAST(SUM(coalesce(POWER(cast(2 as bigint), I.inclusion_rule_id), 0)) AS bigint) as inclusion_rule_mask + from #qualified_events Q + LEFT JOIN #inclusion_events I on q.person_id = i.person_id and q.event_id = i.event_id + GROUP BY Q.person_id, Q.event_id +) MG -- matching groups +group by inclusion_rule_mask +; + +-- calculate gain counts +delete from @results_database_schema.cohort_inclusion_stats where cohort_definition_id = @target_cohort_id and mode_id = 0; +insert into @results_database_schema.cohort_inclusion_stats 
(cohort_definition_id, rule_sequence, person_count, gain_count, person_total, mode_id) +select @target_cohort_id as cohort_definition_id, ir.rule_sequence, coalesce(T.person_count, 0) as person_count, coalesce(SR.person_count, 0) gain_count, EventTotal.total, 0 as mode_id +from #inclusion_rules ir +left join +( + select i.inclusion_rule_id, count_big(i.event_id) as person_count + from #qualified_events Q + JOIN #inclusion_events i on Q.person_id = I.person_id and Q.event_id = i.event_id + group by i.inclusion_rule_id +) T on ir.rule_sequence = T.inclusion_rule_id +CROSS JOIN (select count(*) as total_rules from #inclusion_rules) RuleTotal +CROSS JOIN (select count_big(event_id) as total from #qualified_events) EventTotal +LEFT JOIN @results_database_schema.cohort_inclusion_result SR on SR.mode_id = 0 AND SR.cohort_definition_id = @target_cohort_id AND (POWER(cast(2 as bigint),RuleTotal.total_rules) - POWER(cast(2 as bigint),ir.rule_sequence) - 1) = SR.inclusion_rule_mask -- POWER(2,rule count) - POWER(2,rule sequence) - 1 is the mask for 'all except this rule' +; + +-- calculate totals +delete from @results_database_schema.cohort_summary_stats where cohort_definition_id = @target_cohort_id and mode_id = 0; +insert into @results_database_schema.cohort_summary_stats (cohort_definition_id, base_count, final_count, mode_id) +select @target_cohort_id as cohort_definition_id, PC.total as person_count, coalesce(FC.total, 0) as final_count, 0 as mode_id +FROM +(select count_big(event_id) as total from #qualified_events) PC, +(select sum(sr.person_count) as total + from @results_database_schema.cohort_inclusion_result sr + CROSS JOIN (select count(*) as total_rules from #inclusion_rules) RuleTotal + where sr.mode_id = 0 and sr.cohort_definition_id = @target_cohort_id and sr.inclusion_rule_mask = POWER(cast(2 as bigint),RuleTotal.total_rules)-1 +) FC +; + +-- END: Inclusion Impact Analysis - event + +-- BEGIN: Inclusion Impact Analysis - person +-- calculte matching group counts +delete from @results_database_schema.cohort_inclusion_result where cohort_definition_id = @target_cohort_id and mode_id = 1; +insert into @results_database_schema.cohort_inclusion_result (cohort_definition_id, inclusion_rule_mask, person_count, mode_id) +select @target_cohort_id as cohort_definition_id, inclusion_rule_mask, count_big(*) as person_count, 1 as mode_id +from +( + select Q.person_id, Q.event_id, CAST(SUM(coalesce(POWER(cast(2 as bigint), I.inclusion_rule_id), 0)) AS bigint) as inclusion_rule_mask + from #best_events Q + LEFT JOIN #inclusion_events I on q.person_id = i.person_id and q.event_id = i.event_id + GROUP BY Q.person_id, Q.event_id +) MG -- matching groups +group by inclusion_rule_mask +; + +-- calculate gain counts +delete from @results_database_schema.cohort_inclusion_stats where cohort_definition_id = @target_cohort_id and mode_id = 1; +insert into @results_database_schema.cohort_inclusion_stats (cohort_definition_id, rule_sequence, person_count, gain_count, person_total, mode_id) +select @target_cohort_id as cohort_definition_id, ir.rule_sequence, coalesce(T.person_count, 0) as person_count, coalesce(SR.person_count, 0) gain_count, EventTotal.total, 1 as mode_id +from #inclusion_rules ir +left join +( + select i.inclusion_rule_id, count_big(i.event_id) as person_count + from #best_events Q + JOIN #inclusion_events i on Q.person_id = I.person_id and Q.event_id = i.event_id + group by i.inclusion_rule_id +) T on ir.rule_sequence = T.inclusion_rule_id +CROSS JOIN (select count(*) as total_rules from 
#inclusion_rules) RuleTotal +CROSS JOIN (select count_big(event_id) as total from #best_events) EventTotal +LEFT JOIN @results_database_schema.cohort_inclusion_result SR on SR.mode_id = 1 AND SR.cohort_definition_id = @target_cohort_id AND (POWER(cast(2 as bigint),RuleTotal.total_rules) - POWER(cast(2 as bigint),ir.rule_sequence) - 1) = SR.inclusion_rule_mask -- POWER(2,rule count) - POWER(2,rule sequence) - 1 is the mask for 'all except this rule' +; + +-- calculate totals +delete from @results_database_schema.cohort_summary_stats where cohort_definition_id = @target_cohort_id and mode_id = 1; +insert into @results_database_schema.cohort_summary_stats (cohort_definition_id, base_count, final_count, mode_id) +select @target_cohort_id as cohort_definition_id, PC.total as person_count, coalesce(FC.total, 0) as final_count, 1 as mode_id +FROM +(select count_big(event_id) as total from #best_events) PC, +(select sum(sr.person_count) as total + from @results_database_schema.cohort_inclusion_result sr + CROSS JOIN (select count(*) as total_rules from #inclusion_rules) RuleTotal + where sr.mode_id = 1 and sr.cohort_definition_id = @target_cohort_id and sr.inclusion_rule_mask = POWER(cast(2 as bigint),RuleTotal.total_rules)-1 +) FC +; + +-- END: Inclusion Impact Analysis - person + +TRUNCATE TABLE #best_events; +DROP TABLE #best_events; + +TRUNCATE TABLE #inclusion_rules; +DROP TABLE #inclusion_rules; +} + + + +TRUNCATE TABLE #cohort_rows; +DROP TABLE #cohort_rows; + +TRUNCATE TABLE #final_cohort; +DROP TABLE #final_cohort; + +TRUNCATE TABLE #inclusion_events; +DROP TABLE #inclusion_events; + +TRUNCATE TABLE #qualified_events; +DROP TABLE #qualified_events; + +TRUNCATE TABLE #included_events; +DROP TABLE #included_events; + +TRUNCATE TABLE #Codesets; +DROP TABLE #Codesets; diff --git a/man/DemographicSubsetOperator.Rd b/man/DemographicSubsetOperator.Rd index fea8116..53078dc 100644 --- a/man/DemographicSubsetOperator.Rd +++ b/man/DemographicSubsetOperator.Rd @@ -2,15 +2,13 @@ % Please edit documentation in R/Subsets.R \name{DemographicSubsetOperator} \alias{DemographicSubsetOperator} -\title{Criteria Subset} +\title{Demographic Subset Operator} \value{ char vector Get auto generated name } \description{ -Criteria Subset - -Criteria Subset +Operators for subsetting a cohort by demographic criteria } \section{Super class}{ \code{\link[CohortGenerator:SubsetOperator]{CohortGenerator::SubsetOperator}} -> \code{DemographicSubsetOperator} diff --git a/man/SubsetCohortWindow.Rd b/man/SubsetCohortWindow.Rd index 9b274eb..c0a1e13 100644 --- a/man/SubsetCohortWindow.Rd +++ b/man/SubsetCohortWindow.Rd @@ -2,13 +2,10 @@ % Please edit documentation in R/Subsets.R \name{SubsetCohortWindow} \alias{SubsetCohortWindow} -\title{to List} +\title{Time Window For Cohort Subset Operator} \description{ Representation of a time window to use when subsetting a target cohort with a subset cohort } -\details{ -SubsetCohortWindow settings -} \section{Active bindings}{ \if{html}{\out{
}}
\describe{
diff --git a/man/SubsetOperator.Rd b/man/SubsetOperator.Rd
index 640fa7a..44f7444 100644
--- a/man/SubsetOperator.Rd
+++ b/man/SubsetOperator.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/Subsets.R
 \name{SubsetOperator}
 \alias{SubsetOperator}
-\title{SubsetOperator}
+\title{Abstract base class for subsets.}
 \description{
 Abstract Base Class for subsets. Subsets should inherit from this and implement their own requirements.
 }
diff --git a/man/createCohortSubset.Rd b/man/createCohortSubset.Rd
index 7c3730f..d2250e6 100644
--- a/man/createCohortSubset.Rd
+++ b/man/createCohortSubset.Rd
@@ -21,7 +21,7 @@ createCohortSubset(
 \item{cohortCombinationOperator}{"any" or "all" if using more than one cohort id allow a subject to be in any cohort or require that they are in all cohorts in specified windows}
 
-\item{negate}{The opposite of this definition - include patients who do NOT meet the specified criteria (NOT YET IMPLEMENTED)}
+\item{negate}{The opposite of this definition - include patients who do NOT meet the specified criteria}
 
 \item{startWindow}{A SubsetCohortWindow that patients must fall inside (see createSubsetCohortWindow)}
diff --git a/man/createResultsDataModel.Rd b/man/createResultsDataModel.Rd
new file mode 100644
index 0000000..105eb23
--- /dev/null
+++ b/man/createResultsDataModel.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ResultsDataModel.R
+\name{createResultsDataModel}
+\alias{createResultsDataModel}
+\title{Create the results data model tables on a database server.}
+\usage{
+createResultsDataModel(
+  connectionDetails = NULL,
+  databaseSchema,
+  tablePrefix = ""
+)
+}
+\arguments{
+\item{connectionDetails}{DatabaseConnector connectionDetails instance @seealso[DatabaseConnector::createConnectionDetails]}
+
+\item{databaseSchema}{The schema on the server where the tables will be created.}
+
+\item{tablePrefix}{(Optional) string to insert before table names for database table names}
+}
+\description{
+Create the results data model tables on a database server.
+}
+\details{
+Only PostgreSQL and SQLite servers are supported.
+}
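+\examples{
+\dontrun{
+# Illustrative sketch only: create the results tables in a local SQLite
+# file; "results.sqlite" is a hypothetical path.
+connectionDetails <- DatabaseConnector::createConnectionDetails(
+  dbms = "sqlite",
+  server = "results.sqlite"
+)
+createResultsDataModel(
+  connectionDetails = connectionDetails,
+  databaseSchema = "main",
+  tablePrefix = "cg_"
+)
+}
+}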
diff --git a/man/exportCohortStatsTables.Rd b/man/exportCohortStatsTables.Rd
index e4531fe..9791697 100644
--- a/man/exportCohortStatsTables.Rd
+++ b/man/exportCohortStatsTables.Rd
@@ -13,7 +13,9 @@ exportCohortStatsTables(
   snakeCaseToCamelCase = TRUE,
   fileNamesInSnakeCase = FALSE,
   incremental = FALSE,
-  databaseId = NULL
+  databaseId = NULL,
+  cohortDefinitionSet = NULL,
+  tablePrefix = ""
 )
 }
@@ -48,9 +50,27 @@ overwriting an existing results}
 \item{databaseId}{Optional - when specified, the databaseId will be added
 to the exported results}
+
+\item{cohortDefinitionSet}{The \code{cohortDefinitionSet} argument must be a data frame with
+the following columns: \describe{
+\item{cohortId}{The unique integer identifier of the cohort}
+\item{cohortName}{The cohort's name}
+\item{sql}{The OHDSI-SQL used to generate the cohort}}
+Optionally, this data frame may contain: \describe{
+\item{json}{The Circe JSON representation of the cohort}}}
+
+\item{tablePrefix}{Optional - allows appending a prefix to the exported
+file names.}
 }
 \description{
 This function retrieves the data from the cohort statistics tables and
 writes them to the inclusion statistics folder specified in the function
-call.
+call. NOTE: inclusion rule names are handled in one of two ways:
+
+1. You can specify the cohortDefinitionSet parameter and the inclusion rule
+names will be extracted from the data.frame.
+2. You can insert the inclusion rule names into the database using the
+insertInclusionRuleNames function of this package.
+
+The first approach is preferred, as it avoids the warning that is otherwise
+emitted.
 }
diff --git a/man/getCohortInclusionRules.Rd b/man/getCohortInclusionRules.Rd
new file mode 100644
index 0000000..cfb9589
--- /dev/null
+++ b/man/getCohortInclusionRules.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/CohortStats.R
+\name{getCohortInclusionRules}
+\alias{getCohortInclusionRules}
+\title{Get Cohort Inclusion Rules from a cohort definition set}
+\usage{
+getCohortInclusionRules(cohortDefinitionSet)
+}
+\arguments{
+\item{cohortDefinitionSet}{The \code{cohortDefinitionSet} argument must be a data frame with
+the following columns: \describe{
+\item{cohortId}{The unique integer identifier of the cohort}
+\item{cohortName}{The cohort's name}
+\item{sql}{The OHDSI-SQL used to generate the cohort}}
+Optionally, this data frame may contain: \describe{
+\item{json}{The Circe JSON representation of the cohort}}}
+}
+\description{
+This function returns a data frame of the inclusion rules defined
+in a cohort definition set.
+}
diff --git a/man/getDataMigrator.Rd b/man/getDataMigrator.Rd
new file mode 100644
index 0000000..dfd2758
--- /dev/null
+++ b/man/getDataMigrator.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ResultsDataModel.R
+\name{getDataMigrator}
+\alias{getDataMigrator}
+\title{Get database migrations instance}
+\usage{
+getDataMigrator(connectionDetails, databaseSchema, tablePrefix = "")
+}
+\arguments{
+\item{connectionDetails}{DatabaseConnector connection details object}
+
+\item{databaseSchema}{String schema where the database tables live}
+
+\item{tablePrefix}{(Optional) Use if a table prefix is used before table names (e.g. "cg_")}
+}
+\value{
+Instance of ResultModelManager::DataMigrationManager that has an interface for converting existing data models
+}
+\description{
+Returns a ResultModelManager DataMigrationManager instance.
+}
diff --git a/man/getResultsDataModelSpecifications.Rd b/man/getResultsDataModelSpecifications.Rd
new file mode 100644
index 0000000..b528db2
--- /dev/null
+++ b/man/getResultsDataModelSpecifications.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ResultsDataModel.R
+\name{getResultsDataModelSpecifications}
+\alias{getResultsDataModelSpecifications}
+\title{Get specifications for CohortGenerator results data model}
+\usage{
+getResultsDataModelSpecifications()
+}
+\value{
+A tibble data frame object with specifications
+}
+\description{
+Get specifications for CohortGenerator results data model
+}
diff --git a/man/migrateDataModel.Rd b/man/migrateDataModel.Rd
new file mode 100644
index 0000000..a23acdf
--- /dev/null
+++ b/man/migrateDataModel.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ResultsDataModel.R
+\name{migrateDataModel}
+\alias{migrateDataModel}
+\title{Migrate Data model}
+\usage{
+migrateDataModel(connectionDetails, databaseSchema, tablePrefix = "")
+}
+\arguments{
+\item{connectionDetails}{DatabaseConnector connection details object}
+
+\item{databaseSchema}{String schema where the database tables live}
+
+\item{tablePrefix}{(Optional) Use if a table prefix is used before table names (e.g. "cg_")}
+}
+\description{
+Migrate data from the current state to the next state.
+
+It is strongly advised that you have a backup of all data before migrating: either the sqlite files, a backup database (in the case you are using a postgres backend), or the csv/zip files kept from your data generation.
+}
diff --git a/man/readCsv.Rd b/man/readCsv.Rd
index 8497b54..9eac7ea 100644
--- a/man/readCsv.Rd
+++ b/man/readCsv.Rd
@@ -4,13 +4,43 @@
 \alias{readCsv}
 \title{Used to read a .csv file}
 \usage{
-readCsv(file, warnOnCaseMismatch = TRUE)
+readCsv(file, warnOnCaseMismatch = TRUE, colTypes = readr::cols())
 }
 \arguments{
 \item{file}{The .csv file to read.}
 
 \item{warnOnCaseMismatch}{When TRUE, raise a warning if column headings
 in the .csv are not in snake_case format}
+
+\item{colTypes}{Corresponds to the `col_types` in the `readr::read_csv` function.
+One of `NULL`, a [readr::cols()] specification, or
+a string. See `vignette("readr")` for more details.
+
+If `NULL`, all column types will be inferred from `guess_max` rows of the
+input, interspersed throughout the file. This is convenient (and fast),
+but not robust. If the guessed types are wrong, you'll need to increase
+`guess_max` or supply the correct types yourself.
+
+Column specifications created by [list()] or [cols()] must contain
+one column specification for each column.
+
+Alternatively, you can use a compact string representation where each
+character represents one column:
+- c = character
+- i = integer
+- n = number
+- d = double
+- l = logical
+- f = factor
+- D = date
+- T = date time
+- t = time
+- ? = guess
+- _ or - = skip
+
+By default, reading a file without a column specification will print a
+message showing what `readr` guessed they were. To remove this message,
+set `show_col_types = FALSE` or set `options(readr.show_col_types = FALSE)`.}
 }
 \value{
 A tibble with the .csv contents
diff --git a/man/runCohortGeneration.Rd b/man/runCohortGeneration.Rd
new file mode 100644
index 0000000..7781f28
--- /dev/null
+++ b/man/runCohortGeneration.Rd
@@ -0,0 +1,95 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RunCohortGeneration.R
+\name{runCohortGeneration}
+\alias{runCohortGeneration}
+\title{Run a cohort generation and export results}
+\usage{
+runCohortGeneration(
+  connectionDetails,
+  cdmDatabaseSchema,
+  tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
+  cohortDatabaseSchema = cdmDatabaseSchema,
+  cohortTableNames = getCohortTableNames(),
+  cohortDefinitionSet = NULL,
+  negativeControlOutcomeCohortSet = NULL,
+  occurrenceType = "all",
+  detectOnDescendants = FALSE,
+  stopOnError = TRUE,
+  outputFolder,
+  databaseId = 1,
+  incremental = FALSE,
+  incrementalFolder = NULL
+)
+}
+\arguments{
+\item{connectionDetails}{An object of type \code{connectionDetails} as created using the
+\code{\link[DatabaseConnector]{createConnectionDetails}} function in the
+DatabaseConnector package.}
+
+\item{cdmDatabaseSchema}{Schema name where your patient-level data in OMOP CDM format resides.
+Note that for SQL Server, this should include both the database and
+schema name, for example 'cdm_data.dbo'.}
+
+\item{tempEmulationSchema}{Some database platforms like Oracle and Impala do not truly support
+temp tables. To emulate temp tables, provide a schema with write
+privileges where temp tables can be created.}
+
+\item{cohortDatabaseSchema}{Schema name where your cohort tables reside. Note that for SQL Server,
+this should include both the database and schema name, for example
+'scratch.dbo'.}
+
+\item{cohortTableNames}{The names of the cohort tables. See \code{\link{getCohortTableNames}}
+for more details.}
+
+\item{cohortDefinitionSet}{The \code{cohortDefinitionSet} argument must be a data frame with
+the following columns: \describe{
+\item{cohortId}{The unique integer identifier of the cohort}
+\item{cohortName}{The cohort's name}
+\item{sql}{The OHDSI-SQL used to generate the cohort}}
+Optionally, this data frame may contain: \describe{
+\item{json}{The Circe JSON representation of the cohort}}}
+
+\item{negativeControlOutcomeCohortSet}{The \code{negativeControlOutcomeCohortSet} argument must be a data frame with
+the following columns: \describe{
+\item{cohortId}{The unique integer identifier of the cohort}
+\item{cohortName}{The cohort's name}
+\item{outcomeConceptId}{The concept_id in the condition domain to use for the negative control outcome.}}}
+
+\item{occurrenceType}{For negative control outcomes, the occurrenceType
+will detect either the first time an
+outcomeConceptId occurs or all times the
+outcomeConceptId occurs for a person. Values
+accepted: 'all' or 'first'.}
+
+\item{detectOnDescendants}{For negative control outcomes, when set to TRUE,
+detectOnDescendants will use the vocabulary to
+find negative control outcomes using the
+outcomeConceptId and all descendants via the
+concept_ancestor table. When FALSE, only the exact
+outcomeConceptId will be used to detect the
+outcome.}
+
+\item{stopOnError}{If an error happens while generating one of the
+cohorts in the cohortDefinitionSet, should we
+stop processing the other cohorts? The default is
+TRUE; when set to FALSE, failures will be
+identified in the return value from this function.}
+
+\item{outputFolder}{Name of the folder where all the outputs will be written to.}
+
+\item{databaseId}{A unique ID for the database. This will be appended to
+most tables.}
+
+\item{incremental}{Create only cohorts that haven't been created before?}
+
+\item{incrementalFolder}{If \code{incremental = TRUE}, specify a folder where
+records are kept of which definition has been
+executed.}
+}
+\description{
+Run a cohort generation and export results
+}
+\details{
+Run a cohort generation for a set of cohorts and negative control outcomes.
+This function will also export the results of the run to the `outputFolder`.
+}
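+\examples{
+\dontrun{
+# Sketch against the Eunomia test database; cohortDefinitionSet is assumed
+# to have been built beforehand (e.g. with getCohortDefinitionSet()).
+runCohortGeneration(
+  connectionDetails = Eunomia::getEunomiaConnectionDetails(),
+  cdmDatabaseSchema = "main",
+  cohortDefinitionSet = cohortDefinitionSet,
+  outputFolder = file.path(tempdir(), "cg-output"),
+  databaseId = "Eunomia"
+)
+}
+}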
+) +} +\arguments{ +\item{connectionDetails}{An object of type \code{connectionDetails} as created using the +\code{\link[DatabaseConnector]{createConnectionDetails}} function in the +DatabaseConnector package.} + +\item{schema}{The schema on the server where the tables have been created.} + +\item{resultsFolder}{The folder holding the results in .csv files} + +\item{forceOverWriteOfSpecifications}{If TRUE, specifications of the phenotypes, cohort definitions, and analysis +will be overwritten if they already exist on the database. Only use this if these specifications +have changed since the last upload.} + +\item{purgeSiteDataBeforeUploading}{If TRUE, before inserting data for a specific databaseId all the data for +that site will be dropped. This assumes the resultsFolder file contains the full data for that +data site.} + +\item{tablePrefix}{(Optional) string to insert before table names for database table names} + +\item{...}{See ResultModelManager::uploadResults} +} +\description{ +Requires the results data model tables have been created using the \code{\link{createResultsDataModel}} function. +} diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index 122d3b9..1cda5b4 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -84,49 +84,104 @@ getPlatformConnectionDetails <- function(dbmsPlatform) { options("sqlRenderTempEmulationSchema" = NULL) cohortTable <- "cohort" } else { - if (dbmsPlatform == "postgresql") { - dbUser <- Sys.getenv("CDM5_POSTGRESQL_USER") - dbPassword <- Sys.getenv("CDM5_POSTGRESQL_PASSWORD") - dbServer <- Sys.getenv("CDM5_POSTGRESQL_SERVER") - cdmDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_CDM_SCHEMA") - vocabularyDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_CDM_SCHEMA") - cohortDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_OHDSI_SCHEMA") + if (dbmsPlatform == "bigquery") { + # To avoid rate limit on BigQuery, only test on 1 OS: + if (.Platform$OS.type == "windows") { + bqKeyFile <- tempfile(fileext = ".json") + writeLines(Sys.getenv("CDM_BIG_QUERY_KEY_FILE"), bqKeyFile) + if (testthat::is_testing()) { + withr::defer(unlink(bqKeyFile, force = TRUE), testthat::teardown_env()) + } + bqConnectionString <- gsub( + "", + normalizePath(bqKeyFile, winslash = "/"), + Sys.getenv("CDM_BIG_QUERY_CONNECTION_STRING") + ) + connectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = dbmsPlatform, + user = "", + password = "", + connectionString = !!bqConnectionString, + pathToDriver = jdbcDriverFolder + ) + cdmDatabaseSchema <- Sys.getenv("CDM_BIG_QUERY_CDM_SCHEMA") + vocabularyDatabaseSchema <- Sys.getenv("CDM_BIG_QUERY_CDM_SCHEMA") + cohortDatabaseSchema <- Sys.getenv("CDM_BIG_QUERY_OHDSI_SCHEMA") + options(sqlRenderTempEmulationSchema = Sys.getenv("CDM_BIG_QUERY_OHDSI_SCHEMA")) + } else { + return(NULL) + } } else if (dbmsPlatform == "oracle") { - dbUser <- Sys.getenv("CDM5_ORACLE_USER") - dbPassword <- Sys.getenv("CDM5_ORACLE_PASSWORD") - dbServer <- Sys.getenv("CDM5_ORACLE_SERVER") + connectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = dbmsPlatform, + user = Sys.getenv("CDM5_ORACLE_USER"), + password = URLdecode(Sys.getenv("CDM5_ORACLE_PASSWORD")), + server = Sys.getenv("CDM5_ORACLE_SERVER"), + pathToDriver = jdbcDriverFolder + ) cdmDatabaseSchema <- Sys.getenv("CDM5_ORACLE_CDM_SCHEMA") vocabularyDatabaseSchema <- Sys.getenv("CDM5_ORACLE_CDM_SCHEMA") cohortDatabaseSchema <- Sys.getenv("CDM5_ORACLE_OHDSI_SCHEMA") options(sqlRenderTempEmulationSchema = Sys.getenv("CDM5_ORACLE_OHDSI_SCHEMA")) + } else if 
+      connectionDetails <- DatabaseConnector::createConnectionDetails(
+        dbms = dbmsPlatform,
+        user = Sys.getenv("CDM5_POSTGRESQL_USER"),
+        password = URLdecode(Sys.getenv("CDM5_POSTGRESQL_PASSWORD")),
+        server = Sys.getenv("CDM5_POSTGRESQL_SERVER"),
+        pathToDriver = jdbcDriverFolder
+      )
+      cdmDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_CDM_SCHEMA")
+      vocabularyDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_CDM_SCHEMA")
+      cohortDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_OHDSI_SCHEMA")
     } else if (dbmsPlatform == "redshift") {
-      dbUser <- Sys.getenv("CDM5_REDSHIFT_USER")
-      dbPassword <- Sys.getenv("CDM5_REDSHIFT_PASSWORD")
-      dbServer <- Sys.getenv("CDM5_REDSHIFT_SERVER")
+      connectionDetails <- DatabaseConnector::createConnectionDetails(
+        dbms = dbmsPlatform,
+        user = Sys.getenv("CDM5_REDSHIFT_USER"),
+        password = URLdecode(Sys.getenv("CDM5_REDSHIFT_PASSWORD")),
+        server = Sys.getenv("CDM5_REDSHIFT_SERVER"),
+        pathToDriver = jdbcDriverFolder
+      )
       cdmDatabaseSchema <- Sys.getenv("CDM5_REDSHIFT_CDM_SCHEMA")
       vocabularyDatabaseSchema <- Sys.getenv("CDM5_REDSHIFT_CDM_SCHEMA")
       cohortDatabaseSchema <- Sys.getenv("CDM5_REDSHIFT_OHDSI_SCHEMA")
+    } else if (dbmsPlatform == "snowflake") {
+      connectionDetails <- DatabaseConnector::createConnectionDetails(
+        dbms = dbmsPlatform,
+        user = Sys.getenv("CDM_SNOWFLAKE_USER"),
+        password = URLdecode(Sys.getenv("CDM_SNOWFLAKE_PASSWORD")),
+        connectionString = Sys.getenv("CDM_SNOWFLAKE_CONNECTION_STRING"),
+        pathToDriver = jdbcDriverFolder
+      )
+      cdmDatabaseSchema <- Sys.getenv("CDM_SNOWFLAKE_CDM53_SCHEMA")
+      vocabularyDatabaseSchema <- Sys.getenv("CDM_SNOWFLAKE_CDM53_SCHEMA")
+      cohortDatabaseSchema <- Sys.getenv("CDM_SNOWFLAKE_OHDSI_SCHEMA")
+      options(sqlRenderTempEmulationSchema = Sys.getenv("CDM_SNOWFLAKE_OHDSI_SCHEMA"))
+    } else if (dbmsPlatform == "spark") {
+      connectionDetails <- DatabaseConnector::createConnectionDetails(
+        dbms = dbmsPlatform,
+        user = Sys.getenv("CDM5_SPARK_USER"),
+        password = URLdecode(Sys.getenv("CDM5_SPARK_PASSWORD")),
+        connectionString = Sys.getenv("CDM5_SPARK_CONNECTION_STRING"),
+        pathToDriver = jdbcDriverFolder
+      )
+      cdmDatabaseSchema <- Sys.getenv("CDM5_SPARK_CDM_SCHEMA")
+      vocabularyDatabaseSchema <- Sys.getenv("CDM5_SPARK_CDM_SCHEMA")
+      cohortDatabaseSchema <- Sys.getenv("CDM5_SPARK_OHDSI_SCHEMA")
+      options(sqlRenderTempEmulationSchema = Sys.getenv("CDM5_SPARK_OHDSI_SCHEMA"))
     } else if (dbmsPlatform == "sql server") {
-      dbUser <- Sys.getenv("CDM5_SQL_SERVER_USER")
-      dbPassword <- Sys.getenv("CDM5_SQL_SERVER_PASSWORD")
-      dbServer <- Sys.getenv("CDM5_SQL_SERVER_SERVER")
+      connectionDetails <- DatabaseConnector::createConnectionDetails(
+        dbms = dbmsPlatform,
+        user = Sys.getenv("CDM5_SQL_SERVER_USER"),
+        password = URLdecode(Sys.getenv("CDM5_SQL_SERVER_PASSWORD")),
+        server = Sys.getenv("CDM5_SQL_SERVER_SERVER"),
+        pathToDriver = jdbcDriverFolder
+      )
       cdmDatabaseSchema <- Sys.getenv("CDM5_SQL_SERVER_CDM_SCHEMA")
       vocabularyDatabaseSchema <- Sys.getenv("CDM5_SQL_SERVER_CDM_SCHEMA")
       cohortDatabaseSchema <- Sys.getenv("CDM5_SQL_SERVER_OHDSI_SCHEMA")
     }
-    if (dbServer == "") {
-      return(NULL)
-    }
-
-    connectionDetails <- DatabaseConnector::createConnectionDetails(
-      dbms = dbmsPlatform,
-      user = dbUser,
-      password = URLdecode(dbPassword),
-      server = dbServer,
-      pathToDriver = jdbcDriverFolder
-    )
-
     # Add drivers
     DatabaseConnector::downloadJdbcDrivers(dbmsPlatform, pathToDriver = jdbcDriverFolder)

     # Table created to avoid collisions
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index 8ec0077..1974b23 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -1,4 +1,4 @@
-dbmsPlatforms <- c("redshift", "postgresql", "oracle", "sql server")
+dbmsPlatforms <- c("bigquery", "oracle", "postgresql", "redshift", "snowflake", "spark", "sql server")
 connectionDetails <- Eunomia::getEunomiaConnectionDetails()
 outputFolder <- tempfile()
 dir.create(outputFolder)
diff --git a/tests/testthat/test-CohortConstructionAndStats.R b/tests/testthat/test-CohortConstructionAndStats.R
index 18956b4..d5e1847 100644
--- a/tests/testthat/test-CohortConstructionAndStats.R
+++ b/tests/testthat/test-CohortConstructionAndStats.R
@@ -55,6 +55,23 @@ test_that("Call instantiateCohortSet with malformed cohortDefinitionSet parameter
   )
 })

+test_that("Call instantiateCohortSet with cohortDefinitionSet with non-integer data type", {
+  cohortDefinitionSet <- createEmptyCohortDefinitionSet()
+  cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame(
+    cohortId = 1.2,
+    cohortName = "Test",
+    sql = "sql",
+    foo = "foo"
+  ))
+  expect_error(
+    generateCohortSet(
+      connectionDetails = connectionDetails,
+      cohortDefinitionSet = cohortDefinitionSet
+    ),
+    message = "(included non-integer)"
+  )
+})
+
 test_that("Call instantiateCohortSet with cohortDefinitionSet with extra columns", {
   cohortDefinitionSet <- createEmptyCohortDefinitionSet()
   cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame(
@@ -66,7 +83,7 @@ test_that("Call instantiateCohortSet with cohortDefinitionSet with extra columns"
   expect_error(
     generateCohortSet(
       connectionDetails = connectionDetails,
-      cohortDefinitionSet = data.frame()
+      cohortDefinitionSet = cohortDefinitionSet
     ),
     message = "(must contain the following columns)"
   )
@@ -322,7 +339,7 @@ test_that("Create cohorts with stopOnError = FALSE and incremental = TRUE", {
   )
   expect_equal(nrow(cohortsGenerated), nrow(cohortsWithoutStats))
   expect_equal(nrow(cohortsGenerated[cohortsGenerated$generationStatus == "FAILED", ]), 1)
-  expect_equal(nrow(cohortsGenerated[cohortsGenerated$generationStatus == "COMPLETE", ]), 3)
+  expect_equal(nrow(cohortsGenerated[cohortsGenerated$generationStatus == "COMPLETE", ]), 4)

   # Now update the cohort that was failing to use a SQL statement that will work
   sqlThatWillWork <- "
@@ -344,7 +361,7 @@ test_that("Create cohorts with stopOnError = FALSE and incremental = TRUE", {
   )
   expect_equal(nrow(cohortsGenerated), nrow(cohortsWithoutStats))
   expect_equal(nrow(cohortsGenerated[cohortsGenerated$generationStatus == "COMPLETE", ]), 1)
-  expect_equal(nrow(cohortsGenerated[cohortsGenerated$generationStatus == "SKIPPED", ]), 3)
+  expect_equal(nrow(cohortsGenerated[cohortsGenerated$generationStatus == "SKIPPED", ]), 4)
   unlink(recordKeepingFolder, recursive = TRUE)
   if (file.exists("errorReportSql.txt")) {
     unlink("errorReportSql.txt")
diff --git a/tests/testthat/test-CohortCount.R b/tests/testthat/test-CohortCount.R
index c796a63..ec562bb 100644
--- a/tests/testthat/test-CohortCount.R
+++ b/tests/testthat/test-CohortCount.R
@@ -142,7 +142,7 @@ test_that("Call getCohortCounts with no cohortId specified and cohortDefinitionS
     cohortDefinitionSet = cohortDefinitionSet
   )

-  expect_true(nrow(testCohortCounts) == 4)
+  expect_true(nrow(testCohortCounts) == 5)
   expect_true(testCohortCounts[testCohortCounts$cohortId == 100, "cohortEntries"] == 0)
   expect_true(testCohortCounts[testCohortCounts$cohortId == 100, "cohortSubjects"] == 0)
 })
diff --git a/tests/testthat/test-CohortDefinitionSet.R b/tests/testthat/test-CohortDefinitionSet.R
index 90cb9bd..6329d70 100644
--- a/tests/testthat/test-CohortDefinitionSet.R
+++ b/tests/testthat/test-CohortDefinitionSet.R
@@ -64,7 +64,7 @@ test_that("Call getCohortDefinitionSet with settingsFile in CohortGenerator pack
     packageName = "CohortGenerator",
     verbose = TRUE
   )
-  expect_equal(nrow(cohortDefinitionSet), 3)
+  expect_equal(nrow(cohortDefinitionSet), 4)
 })

 test_that("Call getCohortDefinitionSet with settingsFile in CohortGenerator package where json/sql use the cohort id", {
@@ -74,7 +74,7 @@ test_that("Call getCohortDefinitionSet with settingsFile in CohortGenerator pack
     sqlFolder = "testdata/id/sql/sql_server",
     packageName = "CohortGenerator"
   )
-  expect_equal(nrow(cohortDefinitionSet), 3)
+  expect_equal(nrow(cohortDefinitionSet), 4)
 })
@@ -179,7 +179,7 @@ test_that("Call saveCohortDefinitionSet with missing json", {
     ))
   }

-  expect_output(
+  expect_message(
     saveCohortDefinitionSet(
       cohortDefinitionSet = cohortsToCreate,
       settingsFileName = file.path(tempdir(), "settings"),
@@ -230,6 +230,12 @@ test_that("Call isCohortDefinitionSet with incorrect cohort definition set and e
   expect_warning(expect_false(isCohortDefinitionSet(cohortDefinitionSetError)))
 })

+test_that("Call isCohortDefinitionSet with cohort definition set with integer data type for cohort ID and expect TRUE", {
+  cohortDefinitionSet <- createEmptyCohortDefinitionSet()
+  cohortDefinitionSet$cohortId <- as.integer(cohortDefinitionSet$cohortId)
+  expect_true(isCohortDefinitionSet(cohortDefinitionSet))
+})
+
 test_that("Call isCohortDefinitionSet with cohort definition set with incorrect data type and expect FALSE", {
   cohortDefinitionSet <- createEmptyCohortDefinitionSet()
   cohortDefinitionSet$cohortName <- as.integer(cohortDefinitionSet$cohortName)
diff --git a/tests/testthat/test-Export.R b/tests/testthat/test-Export.R
index bf1eaa4..87f535d 100644
--- a/tests/testthat/test-Export.R
+++ b/tests/testthat/test-Export.R
@@ -57,13 +57,16 @@ test_that("Export cohort stats with permanent tables", {
   )
   checkmate::expect_names(names(cohortStats), subset.of = c("cohortInclusionStatsTable"))

-  # Export the results
-  exportCohortStatsTables(
-    connectionDetails = connectionDetails,
-    cohortDatabaseSchema = "main",
-    cohortTableNames = cohortTableNames,
-    cohortStatisticsFolder = cohortStatsFolder,
-    incremental = FALSE
+
+  expect_warning(
+    # Export the results
+    exportCohortStatsTables(
+      connectionDetails = connectionDetails,
+      cohortDatabaseSchema = "main",
+      cohortTableNames = cohortTableNames,
+      cohortStatisticsFolder = cohortStatsFolder,
+      incremental = FALSE
+    )
   )

   # Verify the files are written to the file system
@@ -102,13 +105,15 @@ test_that("Export cohort stats with databaseId", {
   )

   # Export the results
-  exportCohortStatsTables(
-    connectionDetails = connectionDetails,
-    cohortDatabaseSchema = "main",
-    cohortTableNames = cohortTableNames,
-    cohortStatisticsFolder = cohortStatsFolder,
-    incremental = FALSE,
-    databaseId = "Eunomia"
+  expect_warning(
+    exportCohortStatsTables(
+      connectionDetails = connectionDetails,
+      cohortDatabaseSchema = "main",
+      cohortTableNames = cohortTableNames,
+      cohortStatisticsFolder = cohortStatsFolder,
+      incremental = FALSE,
+      databaseId = "Eunomia"
+    )
   )

   # Verify the files are written to the file system and have the database_id
@@ -146,15 +151,17 @@ test_that("Export cohort stats with fileNamesInSnakeCase = TRUE", {
     incremental = FALSE
   )

-  # Export the results
-  exportCohortStatsTables(
-    connectionDetails = connectionDetails,
-    cohortDatabaseSchema = "main",
-    cohortTableNames = cohortTableNames,
-    cohortStatisticsFolder = cohortStatsFolder,
-    fileNamesInSnakeCase = TRUE,
-    incremental = FALSE,
-    databaseId = "Eunomia"
+  expect_warning(
+    # Export the results
+    exportCohortStatsTables(
+      connectionDetails = connectionDetails,
+      cohortDatabaseSchema = "main",
+      cohortTableNames = cohortTableNames,
+      cohortStatisticsFolder = cohortStatsFolder,
+      fileNamesInSnakeCase = TRUE,
+      incremental = FALSE,
+      databaseId = "Eunomia"
+    )
   )

   # Verify the files are written to the file system and are in snake_case
@@ -176,13 +183,15 @@ test_that("Export cohort stats in incremental mode", {
     cohortTableNames = cohortTableNames
   )

-  # Export the results
-  exportCohortStatsTables(
-    connectionDetails = connectionDetails,
-    cohortDatabaseSchema = "main",
-    cohortTableNames = cohortTableNames,
-    cohortStatisticsFolder = cohortStatsFolder,
-    incremental = TRUE
+  expect_warning(
+    # Export the results
+    exportCohortStatsTables(
+      connectionDetails = connectionDetails,
+      cohortDatabaseSchema = "main",
+      cohortTableNames = cohortTableNames,
+      cohortStatisticsFolder = cohortStatsFolder,
+      incremental = TRUE
+    )
   )

   # Verify the files are written to the file system
@@ -212,15 +221,17 @@ test_that("Export cohort stats with camelCase for column names", {
     cohortDefinitionSet = cohortsWithStats
   )

-  # Export the results
-  exportCohortStatsTables(
-    connectionDetails = connectionDetails,
-    cohortDatabaseSchema = "main",
-    cohortTableNames = cohortTableNames,
-    cohortStatisticsFolder = cohortStatsFolder,
-    snakeCaseToCamelCase = TRUE,
-    fileNamesInSnakeCase = TRUE,
-    incremental = TRUE
+  expect_warning(
+    # Export the results
+    exportCohortStatsTables(
+      connectionDetails = connectionDetails,
+      cohortDatabaseSchema = "main",
+      cohortTableNames = cohortTableNames,
+      cohortStatisticsFolder = cohortStatsFolder,
+      snakeCaseToCamelCase = TRUE,
+      fileNamesInSnakeCase = TRUE,
+      incremental = TRUE
+    )
   )

   # Verify the files are written to the file system and the columns are in
@@ -231,16 +242,18 @@ test_that("Export cohort stats with camelCase for column names", {
     expect_true(all(isCamelCase(names(data))))
   }

-  # Export the results again in incremental mode and verify
-  # the results are preserved
-  exportCohortStatsTables(
-    connectionDetails = connectionDetails,
-    cohortDatabaseSchema = "main",
-    cohortTableNames = cohortTableNames,
-    cohortStatisticsFolder = cohortStatsFolder,
-    snakeCaseToCamelCase = TRUE,
-    fileNamesInSnakeCase = TRUE,
-    incremental = TRUE
+  expect_warning(
+    # Export the results again in incremental mode and verify
+    # the results are preserved
+    exportCohortStatsTables(
+      connectionDetails = connectionDetails,
+      cohortDatabaseSchema = "main",
+      cohortTableNames = cohortTableNames,
+      cohortStatisticsFolder = cohortStatsFolder,
+      snakeCaseToCamelCase = TRUE,
+      fileNamesInSnakeCase = TRUE,
+      incremental = TRUE
+    )
   )

   # Verify the cohort_inc_stats.csv contains cohortDefinitionIds c(2,3)
@@ -273,15 +286,17 @@ test_that("Export cohort stats with snake_case for column names", {
     cohortDefinitionSet = cohortsWithStats
   )

-  # Export the results
-  exportCohortStatsTables(
-    connectionDetails = connectionDetails,
-    cohortDatabaseSchema = "main",
-    cohortTableNames = cohortTableNames,
-    cohortStatisticsFolder = cohortStatsFolder,
-    snakeCaseToCamelCase = FALSE,
-    fileNamesInSnakeCase = TRUE,
-    incremental = TRUE
+  expect_warning(
+    # Export the results
+    exportCohortStatsTables(
+      connectionDetails = connectionDetails,
+      cohortDatabaseSchema = "main",
+      cohortTableNames = cohortTableNames,
+      cohortStatisticsFolder = cohortStatsFolder,
+      snakeCaseToCamelCase = FALSE,
+      fileNamesInSnakeCase = TRUE,
+      incremental = TRUE
+    )
   )

   # Verify the files are written to the file system and the columns are in
@@ -292,16 +307,18 @@ test_that("Export cohort stats with snake_case for column names", {
     expect_true(all(isSnakeCase(names(data))))
   }

-  # Export the results again in incremental mode and verify
-  # the results are preserved
-  exportCohortStatsTables(
-    connectionDetails = connectionDetails,
-    cohortDatabaseSchema = "main",
-    cohortTableNames = cohortTableNames,
-    cohortStatisticsFolder = cohortStatsFolder,
-    snakeCaseToCamelCase = FALSE,
-    fileNamesInSnakeCase = TRUE,
-    incremental = TRUE
+  expect_warning(
+    # Export the results again in incremental mode and verify
+    # the results are preserved
+    exportCohortStatsTables(
+      connectionDetails = connectionDetails,
+      cohortDatabaseSchema = "main",
+      cohortTableNames = cohortTableNames,
+      cohortStatisticsFolder = cohortStatsFolder,
+      snakeCaseToCamelCase = FALSE,
+      fileNamesInSnakeCase = TRUE,
+      incremental = TRUE
+    )
   )

   # Verify the cohort_inc_stats.csv contains cohort_definition_id == c(2,3)
@@ -312,3 +329,48 @@ test_that("Export cohort stats with snake_case for column names", {
     expect_equal(unique(data$cohort_definition_id), c(2, 3))
   unlink(cohortStatsFolder)
 })
+
+test_that("Export cohort stats using cohortDefinitionSet for inclusion rule names", {
+  cohortTableNames <- getCohortTableNames(cohortTable = "cohortStatsInclRule")
+  cohortStatsFolder <- file.path(outputFolder, "stats")
+  # First create the cohort tables
+  createCohortTables(
+    connectionDetails = connectionDetails,
+    cohortDatabaseSchema = "main",
+    cohortTableNames = cohortTableNames
+  )
+
+  # Generate with stats
+  cohortsWithStats <- getCohortsForTest(cohorts, generateStats = TRUE)
+  generateCohortSet(
+    connectionDetails = connectionDetails,
+    cohortDefinitionSet = cohortsWithStats,
+    cdmDatabaseSchema = "main",
+    cohortTableNames = cohortTableNames,
+    cohortDatabaseSchema = "main",
+    incremental = FALSE
+  )
+
+  # Export the results
+  exportCohortStatsTables(
+    connectionDetails = connectionDetails,
+    cohortDatabaseSchema = "main",
+    cohortTableNames = cohortTableNames,
+    cohortStatisticsFolder = cohortStatsFolder,
+    incremental = FALSE,
+    databaseId = "Eunomia",
+    cohortDefinitionSet = cohortsWithStats
+  )
+
+  # Verify the files are written to the file system and that
+  # the cohort inclusion information has been written
+  exportedFiles <- list.files(path = cohortStatsFolder, pattern = ".csv", full.names = TRUE)
+  expect_true("cohortInclusion.csv" %in% basename(exportedFiles))
+  for (i in 1:length(exportedFiles)) {
+    if (basename(exportedFiles[i]) == "cohortInclusion.csv") {
+      data <- CohortGenerator:::.readCsv(file = exportedFiles[i])
+      expect_true(nrow(data) > 0)
+    }
+  }
+  unlink(cohortStatsFolder)
+})
diff --git a/tests/testthat/test-NegativeControlCohorts.R b/tests/testthat/test-NegativeControlCohorts.R
index f2444e6..46a5c9e 100644
--- a/tests/testthat/test-NegativeControlCohorts.R
+++ b/tests/testthat/test-NegativeControlCohorts.R
@@ -12,6 +12,38 @@ test_that("Call generateNegativeControlOutcomeCohorts without connection or conn
   expect_error(generateNegativeControlOutcomeCohorts())
 })

+test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer cohort ID", {
+  negativeControlOutcomeCohortSet <- data.frame(
+    cohortId = 1.2,
+    cohortName = "invalid cohort id",
+    outcomeConceptId = 1
+  )
+  expect_error(
+    generateNegativeControlOutcomeCohorts(
+      connectionDetails = connectionDetails,
+      cdmDatabaseSchema = "main",
+      negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet
+    ),
+    message = "(non-integer values)"
+  )
+})
+
+test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer outcome concept ID", {
+  negativeControlOutcomeCohortSet <- data.frame(
+    cohortId = 1,
+    cohortName = "invalid outcome concept id",
+    outcomeConceptId = 1.2
+  )
+  expect_error(
+    generateNegativeControlOutcomeCohorts(
+      connectionDetails = connectionDetails,
+      cdmDatabaseSchema = "main",
+      negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet
+    ),
+    message = "(non-integer values)"
+  )
+})
+
 test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing duplicate IDs", {
   negativeControlOutcomeCohortSet <- data.frame(
     cohortId = 1,
diff --git a/tests/testthat/test-ResultsDataModel.R b/tests/testthat/test-ResultsDataModel.R
new file mode 100644
index 0000000..f8ae0aa
--- /dev/null
+++ b/tests/testthat/test-ResultsDataModel.R
@@ -0,0 +1,150 @@
+library(CohortGenerator)
+library(testthat)
+
+if (dir.exists(Sys.getenv("DATABASECONNECTOR_JAR_FOLDER"))) {
+  jdbcDriverFolder <- Sys.getenv("DATABASECONNECTOR_JAR_FOLDER")
+} else {
+  jdbcDriverFolder <- "~/.jdbcDrivers"
+  dir.create(jdbcDriverFolder, showWarnings = FALSE)
+  DatabaseConnector::downloadJdbcDrivers("postgresql", pathToDriver = jdbcDriverFolder)
+  withr::defer(
+    {
+      unlink(jdbcDriverFolder, recursive = TRUE, force = TRUE)
+    },
+    testthat::teardown_env()
+  )
+}
+
+postgresConnectionDetails <- DatabaseConnector::createConnectionDetails(
+  dbms = "postgresql",
+  user = Sys.getenv("CDM5_POSTGRESQL_USER"),
+  password = URLdecode(Sys.getenv("CDM5_POSTGRESQL_PASSWORD")),
+  server = Sys.getenv("CDM5_POSTGRESQL_SERVER"),
+  pathToDriver = jdbcDriverFolder
+)
+
+postgresResultsDatabaseSchema <- paste0("r", Sys.getpid(), format(Sys.time(), "%s"), sample(1:100, 1))
+
+databaseFile <- tempfile(fileext = ".sqlite")
+sqliteConnectionDetails <- DatabaseConnector::createConnectionDetails(
+  dbms = "sqlite",
+  server = databaseFile
+)
+sqliteResultsDatabaseSchema <- "main"
+
+withr::defer(
+  {
+    connection <- DatabaseConnector::connect(connectionDetails = postgresConnectionDetails)
+    sql <- "DROP SCHEMA IF EXISTS @resultsDatabaseSchema CASCADE;"
+    DatabaseConnector::renderTranslateExecuteSql(
+      sql = sql,
+      resultsDatabaseSchema = postgresResultsDatabaseSchema,
+      connection = connection
+    )
+
+    DatabaseConnector::disconnect(connection)
+    unlink(databaseFile, force = TRUE)
+  },
+  testthat::teardown_env()
+)
+
+testCreateSchema <- function(connectionDetails, resultsDatabaseSchema) {
+  connection <- DatabaseConnector::connect(connectionDetails)
+  on.exit(DatabaseConnector::disconnect(connection))
+  if (connectionDetails$dbms != "sqlite") {
+    sql <- "CREATE SCHEMA @resultsDatabaseSchema;"
+    DatabaseConnector::renderTranslateExecuteSql(
+      sql = sql,
+      resultsDatabaseSchema = resultsDatabaseSchema,
+      connection = connection
+    )
+  }
+  suppressWarnings(
+    createResultsDataModel(
+      connectionDetails = connectionDetails,
+      databaseSchema = resultsDatabaseSchema,
+    )
+  )
+  specifications <- getResultsDataModelSpecifications()
+  for (tableName in unique(specifications$tableName)) {
+    expect_true(DatabaseConnector::existsTable(
+      connection = connection,
+      databaseSchema = resultsDatabaseSchema,
+      tableName = tableName
+    ))
+  }
+  # Bad schema name
+  expect_error(createResultsDataModel(
+    connectionDetails = connectionDetails,
+    databaseSchema = "non_existent_schema"
+  ))
+}
+
+test_that("Create schema", {
+  testCreateSchema(
+    connectionDetails = postgresConnectionDetails,
+    resultsDatabaseSchema = postgresResultsDatabaseSchema
+  )
+  testCreateSchema(
+    connectionDetails = sqliteConnectionDetails,
+    resultsDatabaseSchema = sqliteResultsDatabaseSchema
+  )
+})
+
+testUploadResults <- function(connectionDetails, resultsDatabaseSchema, resultsFolder) {
+  uploadResults(
+    connectionDetails = connectionDetails,
+    schema = resultsDatabaseSchema,
+    resultsFolder = resultsFolder,
+    purgeSiteDataBeforeUploading = FALSE
+  )
+
+  # Check if there's data:
+  connection <- DatabaseConnector::connect(connectionDetails)
+  on.exit(DatabaseConnector::disconnect(connection))
+
+  specifications <- getResultsDataModelSpecifications()
+  for (tableName in unique(specifications$tableName)) {
+    primaryKey <- specifications %>%
+      dplyr::filter(tableName == !!tableName &
+        primaryKey == "Yes") %>%
+      dplyr::select(columnName) %>%
+      dplyr::pull()
+
+    if ("database_id" %in% primaryKey) {
+      sql <- "SELECT COUNT(*) FROM @database_schema.@table_name WHERE database_id = '@database_id';"
+      databaseIdCount <- DatabaseConnector::renderTranslateQuerySql(
+        connection = connection,
+        sql = sql,
+        database_schema = resultsDatabaseSchema,
+        table_name = tableName,
+        database_id = "Eunomia"
+      )[, 1]
+      expect_true(databaseIdCount >= 0)
+    }
+  }
+}
+
+test_that("Results upload", {
+  unzipFolder <- tempfile("unzipTempFolder", tmpdir = tempdir())
+  dir.create(path = unzipFolder, recursive = TRUE)
+  on.exit(unlink(unzipFolder, recursive = TRUE), add = TRUE)
+
+  zip::unzip(
+    zipfile = system.file(
+      "testdata/Results_Eunomia.zip",
+      package = "CohortGenerator"
+    ),
+    exdir = unzipFolder
+  )
+  testUploadResults(
+    connectionDetails = postgresConnectionDetails,
+    resultsDatabaseSchema = postgresResultsDatabaseSchema,
+    resultsFolder = unzipFolder
+  )
+  testUploadResults(
+    connectionDetails = sqliteConnectionDetails,
+    resultsDatabaseSchema = sqliteResultsDatabaseSchema,
+    resultsFolder = unzipFolder
+  )
+})
diff --git a/tests/testthat/test-RunCohortGeneration.R b/tests/testthat/test-RunCohortGeneration.R
new file mode 100644
index 0000000..eeb38b7
--- /dev/null
+++ b/tests/testthat/test-RunCohortGeneration.R
@@ -0,0 +1,76 @@
+library(testthat)
+library(CohortGenerator)
+
+# Exception Handling -------------
+test_that("Call runCohortGeneration without connectionDetails", {
+  expect_error(runCohortGeneration(), message = "(connection details)")
+})
+
+test_that("Call runCohortGeneration without a cohortDefinitionSet or negativeControlOutcomeCohortSet", {
+  expect_error(
+    runCohortGeneration(
+      connectionDetails = connectionDetails
+    ),
+    message = "(You must supply at least 1 cohortDefinitionSet OR 1 negativeControlOutcomeCohortSet)"
+  )
+})
+
+test_that("Call runCohortGeneration happy path", {
+  testOutputFolder <- file.path(outputFolder, "runCG")
+  on.exit(unlink(testOutputFolder, recursive = TRUE))
+  cohortsWithStats <- getCohortsForTest(cohorts, generateStats = TRUE)
+  ncSet <- getNegativeControlOutcomeCohortsForTest()
+  expectedDatabaseId <- "db1"
+
+  runCohortGeneration(
+    connectionDetails = connectionDetails,
+    cdmDatabaseSchema = "main",
+    cohortDatabaseSchema = "main",
+    cohortTableNames = CohortGenerator::getCohortTableNames("runCG"),
+    cohortDefinitionSet = cohortsWithStats,
+    negativeControlOutcomeCohortSet = ncSet,
+    occurrenceType = "all",
+    detectOnDescendants = TRUE,
+    stopOnError = FALSE,
+    outputFolder = testOutputFolder,
+    databaseId = expectedDatabaseId,
+    incremental = F
+  )
+
+  # Ensure the resultsDataModelSpecification.csv is written
+  # to the output folder
+  expect_true(file.exists(file.path(testOutputFolder, "resultsDataModelSpecification.csv")))
+
+  # Make sure the output includes a file for every table in the spec
+  spec <- CohortGenerator::readCsv(
+    file = system.file("csv", "resultsDataModelSpecification.csv", package = "CohortGenerator")
+  )
+  expectedFileList <- paste0(unique(spec$tableName), ".csv")
+  diffs <- setdiff(expectedFileList, basename(list.files(testOutputFolder)))
+  expect_true(length(diffs) == 0)
+
+  # Make sure that each output file contains the same columns as defined
+  # in the specification
+  for (i in seq_along(expectedFileList)) {
+    data <- readCsv(
+      file = file.path(testOutputFolder, expectedFileList[i])
+    )
+    tbl <- tools::file_path_sans_ext(expectedFileList[i])
+
+    emptyResult <- CohortGenerator:::createEmptyResult(tbl)
+    expect_equal(!!c(tbl, sort(names(data))), !!c(tbl, sort(names(emptyResult))))
+  }
+
+  # Make sure that the output that specifies a database ID has the correct
+  # value included
+  tablesWithDatabaseId <- spec %>%
+    dplyr::filter(columnName == "database_id")
+  for (i in seq_len(nrow(tablesWithDatabaseId))) {
+    # Read in the data and ensure all of the database_ids match the
+    # one used in the test
+    data <- CohortGenerator::readCsv(
+      file = file.path(testOutputFolder, paste0(tablesWithDatabaseId$tableName[i], ".csv"))
+    )
+    expect_true(all(data$databaseId == expectedDatabaseId))
+  }
+})
diff --git a/tests/testthat/test-Subsets.R b/tests/testthat/test-Subsets.R
index 446ade2..7569431 100644
--- a/tests/testthat/test-Subsets.R
+++ b/tests/testthat/test-Subsets.R
@@ -242,13 +242,13 @@ test_that("subset generation", {

   checkmate::expect_list(getSubsetDefinitions(cohortDefinitionSetWithSubset), min.len = 1, types = "CohortSubsetDefinition")

-  expect_true(nrow(cohortDefinitionSetWithSubset) == 6)
+  expect_true(nrow(cohortDefinitionSetWithSubset) == 8)

   # Test only applying to a subset
   cohortDefinitionSetWithSubset2 <- cohortDefinitionSet %>%
     addCohortSubsetDefinition(subsetDef, targetCohortIds = c(1, 2))

-  expect_true(nrow(cohortDefinitionSetWithSubset2) == 5)
+  expect_true(nrow(cohortDefinitionSetWithSubset2) == 6)

   expect_true(attr(cohortDefinitionSetWithSubset, "hasSubsetDefinitions"))
   expect_true("isSubset" %in% colnames(cohortDefinitionSetWithSubset))
@@ -453,7 +453,7 @@ test_that("Subset name templates function", {
     CohortGenerator::addCohortSubsetDefinition(subsetDef)

   # Check name templates are applied
-  expect_true(all(grepl("FOOO (.+) test definition 123 Demographic Criteria 1zzzzDemographic Criteria 2", cohortDefinitionSetWithSubset$cohortName[4:6])))
+  expect_true(all(grepl("FOOO (.+) test definition 123 Demographic Criteria 1zzzzDemographic Criteria 2", cohortDefinitionSetWithSubset$cohortName[5:8])))

   # Internal copy call
   cds2 <- .copySubsetDefinitions(cohortDefinitionSet, cohortDefinitionSetWithSubset)
@@ -461,3 +461,452 @@ test_that("Subset name templates function", {
   checkmate::expect_list(attr(cds2, "cohortSubsetDefinitions"))
   expect_true(attr(cds2, "hasSubsetDefinitions"))
 })
+
+test_that("Subset logic checks", {
+  databaseFile <- tempfile(fileext = ".sqlite")
+  sqliteConnectionDetails <- DatabaseConnector::createConnectionDetails(
+    dbms = "sqlite",
+    server = databaseFile
+  )
+  sqliteResultsDatabaseSchema <- "main"
+  connection <- DatabaseConnector::connect(sqliteConnectionDetails)
+  withr::defer(
+    {
+      DatabaseConnector::disconnect(connection)
+      unlink(databaseFile, force = TRUE)
+    },
+    testthat::teardown_env()
+  )
+
+  # Create dummy OMOP data for testing ------------------
+  DatabaseConnector::insertTable(
+    connection = connection,
+    databaseSchema = sqliteResultsDatabaseSchema,
+    tableName = "observation_period",
+    data = data.frame(
+      observation_period_id = 1,
+      person_id = 1,
+      observation_period_start_date = lubridate::date("2000-01-01"),
+      observation_period_end_date = lubridate::date("2008-12-31")
+    )
+  )
+
+  DatabaseConnector::insertTable(
+    connection = connection,
+    databaseSchema = sqliteResultsDatabaseSchema,
+    tableName = "person",
+    data = data.frame(
+      person_id = 1,
+      gender_concept_id = 8532,
+      year_of_birth = 2000,
+      race_concept_id = 0,
+      ethnicity_concept_id = 0
+    )
+  )
+
+
+  # Define limit subsets for tests -------------
+  lsd1 <- createCohortSubsetDefinition(
+    name = "first ever",
+    definitionId = 101,
+    subsetOperators = list(
+      createLimitSubset(
+        name = "first ever",
+        limitTo = "firstEver"
+      )
+    )
+  )
+
+  lsd2 <- createCohortSubsetDefinition(
+    name = "earliestRemaining",
+    definitionId = 102,
+    subsetOperators = list(
+      createLimitSubset(
+        name = "earliestRemaining",
+        limitTo = "earliestRemaining",
+        priorTime = 500
+      )
+    )
+  )
+
+  lsd3 <- createCohortSubsetDefinition(
+    name = "latestRemaining",
+    definitionId = 103,
+    subsetOperators = list(
+      createLimitSubset(
+        name = "latestRemaining",
+        limitTo = "latestRemaining",
+        followUpTime = 800
+      )
+    )
+  )
+
+  lsd4 <- createCohortSubsetDefinition(
+    name = "lastEver",
+    definitionId = 104,
+    subsetOperators = list(
+      createLimitSubset(
+        name = "lastEver",
+        limitTo = "lastEver"
+      )
+    )
+  )
+
+  lsd5 <- createCohortSubsetDefinition(
+    name = "calendar",
+    definitionId = 105,
+    subsetOperators = list(
+      createLimitSubset(
+        name = "2003 - 2006",
+        calendarStartDate = "2003-01-01",
+        calendarEndDate = "2006-12-31",
+      )
+    )
+  )
+
+  lsd6 <- createCohortSubsetDefinition(
+    name = "firstEver + calendar",
+    definitionId = 106,
+    subsetOperators = list(
+      createLimitSubset(
+        limitTo = "firstEver",
+        name = "2003 - 2006",
+        calendarStartDate = "2003-01-01",
+        calendarEndDate = "2006-12-31",
+      )
+    )
+  )
+
+  lsd7 <- createCohortSubsetDefinition(
+    name = "earliestRemaining + calendar",
+    definitionId = 107,
+    subsetOperators = list(
+      createLimitSubset(
+        limitTo = "earliestRemaining",
+        name = "2003 - 2006",
+        priorTime = 500,
+        calendarStartDate = "2003-01-01",
+        calendarEndDate = "2006-12-31",
+      )
+    )
+  )
+
+  # Define demographics subsets for tests -------------
+  ds1 <- createCohortSubsetDefinition(
+    name = "Age subset",
+    definition = 201,
+    subsetOperators = list(
+      createDemographicSubset(
+        name = "Age 2-5",
+        ageMin = 2,
+        ageMax = 5
+      )
+    )
+  )
+
+  ds2 <- createCohortSubsetDefinition(
+    name = "Gender subset",
+    definition = 202,
+    subsetOperators = list(
+      createDemographicSubset(
+        name = "Gender = 8532",
+        gender = 8532
+      )
+    )
+  )
+
+  ds3 <- createCohortSubsetDefinition(
+    name = "Race subset",
+    definition = 203,
+    subsetOperators = list(
+      createDemographicSubset(
+        name = "Race = 0",
+        race = 0
+      )
+    )
+  )
+
+  ds4 <- createCohortSubsetDefinition(
+    name = "Race subset",
+    definition = 204,
+    subsetOperators = list(
+      createDemographicSubset(
+        name = "Ethnicity = 0",
+        ethnicity = 0
+      )
+    )
+  )
+
+  # Define cohort subsets for tests -------------
+  cs1 <- createCohortSubsetDefinition(
+    name = "Subset overlaps cohort start",
+    definition = 301,
+    subsetOperators = list(
+      createCohortSubset(
+        name = "subsetOverlapTargetStart",
+        cohortIds = c(2),
+        negate = F,
+        cohortCombinationOperator = "any",
+        startWindow = createSubsetCohortWindow(-99999, 0, "cohortStart"),
+        endWindow = createSubsetCohortWindow(0, 99999, "cohortStart")
+      )
+    )
+  )
+
+  cs2 <- createCohortSubsetDefinition(
+    name = "Subset overlaps entire target cohort period",
+    definition = 302,
+    subsetOperators = list(
+      createCohortSubset(
+        name = "subsetSubsumesTarget",
+        cohortIds = c(3),
+        negate = F,
+        cohortCombinationOperator = "any",
+        startWindow = createSubsetCohortWindow(-99999, -1, "cohortStart"),
+        endWindow = createSubsetCohortWindow(1, 99999, "cohortEnd")
+      )
+    )
+  )
+
+  cs3 <- createCohortSubsetDefinition(
+    name = "Subset subsumed by entire target cohort period",
+    definition = 303,
+    subsetOperators = list(
+      createCohortSubset(
+        name = "targetSubsumesSubset",
+        cohortIds = c(4),
+        negate = F,
+        cohortCombinationOperator = "any",
+        startWindow = createSubsetCohortWindow(1, 99999, "cohortStart"),
+        endWindow = createSubsetCohortWindow(-99999, 1, "cohortEnd")
+      )
+    )
+  )
+
+  cs4 <- createCohortSubsetDefinition(
+    name = "Subset overlaps cohort end",
+    definition = 304,
+    subsetOperators = list(
+      createCohortSubset(
+        name = "subsetOverlapTargetEnd",
+        cohortIds = c(5),
+        negate = F,
+        cohortCombinationOperator = "any",
+        startWindow = createSubsetCohortWindow(-99999, 0, "cohortEnd"),
+        endWindow = createSubsetCohortWindow(0, 99999, "cohortEnd")
+      )
+    )
+  )
+
+  cs5 <- createCohortSubsetDefinition(
+    name = "Subset does NOT overlap cohort end - negate",
+    definition = 305,
+    subsetOperators = list(
+      createCohortSubset(
+        name = "subsetOverlapTargetEndNegate",
+        cohortIds = c(5),
+        negate = T,
+        cohortCombinationOperator = "any",
+        startWindow = createSubsetCohortWindow(-99999, 0, "cohortEnd"),
+        endWindow = createSubsetCohortWindow(0, 99999, "cohortEnd")
+      )
+    )
+  )
+
+  cs6 <- createCohortSubsetDefinition(
+    name = "Subset overlaps target start - tests combo == all",
+    definition = 306,
+    subsetOperators = list(
+      createCohortSubset(
+        name = "subsetOverlapTargetStartComboAll",
+        cohortIds = c(2, 3),
+        negate = F,
+        cohortCombinationOperator = "all",
+        startWindow = createSubsetCohortWindow(-99999, 0, "cohortStart"),
+        endWindow = createSubsetCohortWindow(0, 99999, "cohortStart")
+      )
+    )
+  )
+
+  # Create cohort def. set and apply subset definitions ---------
+  cohortDefinitionSet <- data.frame(
+    cohortId = 1,
+    cohortName = "Test Target Cohort",
+    sql = "
+    INSERT INTO @results_database_schema.@target_cohort_table (
+      cohort_definition_id,
+      subject_id,
+      cohort_start_date,
+      cohort_end_date
+    )
+    SELECT @target_cohort_id, 1, DATEFROMPARTS(2001, 01, 01), DATEFROMPARTS(2002, 01, 01)
+    UNION
+    SELECT @target_cohort_id, 1, DATEFROMPARTS(2003, 01, 01), DATEFROMPARTS(2004, 01, 01)
+    UNION
+    SELECT @target_cohort_id, 1, DATEFROMPARTS(2005, 01, 01), DATEFROMPARTS(2006, 01, 01)
+    UNION
+    SELECT @target_cohort_id, 1, DATEFROMPARTS(2007, 01, 01), DATEFROMPARTS(2008, 01, 01)
+    ;",
+    json = ""
+  )
+  cohortDefinitionSet <- rbind(
+    cohortDefinitionSet,
+    data.frame(
+      cohortId = 2,
+      cohortName = "Test Subset 1 - Subset Overlaps Target Start Date",
+      sql = "
+      INSERT INTO @results_database_schema.@target_cohort_table (
+        cohort_definition_id,
+        subject_id,
+        cohort_start_date,
+        cohort_end_date
+      )
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2000, 01, 01), DATEFROMPARTS(2001, 12, 31)
+      UNION
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2002, 01, 01), DATEFROMPARTS(2003, 12, 31)
+      UNION
+      -- NOTE: DOES NOT OVERLAP COHORT ID = 1 FOR TESTING
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2004, 01, 01), DATEFROMPARTS(2004, 12, 31)
+      ;",
+      json = ""
+    )
+  )
+
+  cohortDefinitionSet <- rbind(
+    cohortDefinitionSet,
+    data.frame(
+      cohortId = 3,
+      cohortName = "Test Subset 2 - Subset start+end subsumes target start+end",
+      sql = "
+      INSERT INTO @results_database_schema.@target_cohort_table (
+        cohort_definition_id,
+        subject_id,
+        cohort_start_date,
+        cohort_end_date
+      )
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2000, 01, 01), DATEFROMPARTS(2003, 12, 31)
+      UNION
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2002, 01, 01), DATEFROMPARTS(2005, 12, 31)
+      UNION
+      -- NOTE: DOES NOT FULLY SUBSUME COHORT ID = 1 FOR TESTING
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2004, 01, 01), DATEFROMPARTS(2005, 12, 31)
+      ;",
+      json = ""
+    )
+  )
+
+  cohortDefinitionSet <- rbind(
+    cohortDefinitionSet,
+    data.frame(
+      cohortId = 4,
+      cohortName = "Test Subset 3 - Target start+end subsumes Subset start+end",
+      sql = "
+      INSERT INTO @results_database_schema.@target_cohort_table (
+        cohort_definition_id,
+        subject_id,
+        cohort_start_date,
+        cohort_end_date
+      )
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2001, 02, 01), DATEFROMPARTS(2001, 12, 31)
+      UNION
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2003, 02, 01), DATEFROMPARTS(2003, 12, 31)
+      UNION
+      -- NOTE: IS NOT FULLY SUBSUMED BY COHORT ID = 1 FOR TESTING
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2004, 01, 01), DATEFROMPARTS(2005, 12, 31)
+      ;",
+      json = ""
+    )
+  )
+
+  cohortDefinitionSet <- rbind(
+    cohortDefinitionSet,
+    data.frame(
+      cohortId = 5,
+      cohortName = "Test Subset 4 - Subset Overlaps Target End Date",
+      sql = "
+      INSERT INTO @results_database_schema.@target_cohort_table (
+        cohort_definition_id,
+        subject_id,
+        cohort_start_date,
+        cohort_end_date
+      )
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2001, 02, 01), DATEFROMPARTS(2002, 02, 01)
+      UNION
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2003, 02, 01), DATEFROMPARTS(2004, 02, 01)
+      UNION
+      -- NOTE: DOES NOT OVERLAP ANY END DATE ENTRIES IN COHORT ID = 1 FOR TESTING
+      SELECT @target_cohort_id, 1, DATEFROMPARTS(2003, 02, 01), DATEFROMPARTS(2003, 03, 01)
+      ;",
+      json = ""
+    )
+  )
+
+  cohortDefinitionSet <- cohortDefinitionSet |>
+    addCohortSubsetDefinition(lsd1, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(lsd2, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(lsd3, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(lsd4, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(lsd5, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(lsd6, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(lsd7, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(ds1, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(ds2, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(ds3, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(ds4, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(cs1, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(cs2, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(cs3, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(cs4, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(cs5, targetCohortIds = c(1)) |>
+    addCohortSubsetDefinition(cs6, targetCohortIds = c(1))
+
+  # Generate cohorts ------------
+  cohortTableNames <- getCohortTableNames()
+
+  createCohortTables(
+    connection = connection,
+    cohortDatabaseSchema = "main",
+    cohortTableNames = cohortTableNames
+  )
+
+  generateCohortSet(
+    connection = connection,
+    cdmDatabaseSchema = "main",
+    cohortDatabaseSchema = "main",
+    cohortTableNames = getCohortTableNames(),
+    cohortDefinitionSet = cohortDefinitionSet
+  )
+
+
+  cohorts <- DatabaseConnector::querySql(
+    connection = connection,
+    sql = "SELECT * FROM main.cohort ORDER BY COHORT_DEFINITION_ID, SUBJECT_ID, COHORT_START_DATE;"
+  )
+
+  # Check the cohort counts to verify the logic worked as expected ---------
+  # cohorts # <------ USE TO SEE THE COHORTS TO VERIFY THE INFO BELOW
+
+  # Limit subsets cohort definition 1100 range ------
+  expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1101, ]$COHORT_START_DATE[[1]], lubridate::date("2001-01-01")) # 1101 - First Ever
+  expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1102, ]$COHORT_START_DATE[[1]], lubridate::date("2003-01-01")) # 1102 - Earliest Remaining
+  expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1103, ]$COHORT_START_DATE[[1]], lubridate::date("2005-01-01")) # 1103 - Latest Remaining
+  expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1104, ]$COHORT_START_DATE[[1]], lubridate::date("2007-01-01")) # 1104 - Last Ever
+  expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1105, ]$COHORT_START_DATE[[1]], lubridate::date("2003-01-01")) # 1105 - Calendar #1
+  expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1105, ]$COHORT_START_DATE[[2]], lubridate::date("2005-01-01")) # 1105 - Calendar #2
+  expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1106, ]), 0) # 1106 - First ever + calendar time that restricts to no one
+  expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1107, ]$COHORT_START_DATE[[1]], lubridate::date("2003-01-01")) # 1107 - Earliest remaining+calendar restriction
+
+  # Demographic subsets cohort definition 1200 range ------
+  expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1201, ]), 2) # 1201 - Age 2-5
+  expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1202, ]), 4) # 1202 - Gender
+  expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1203, ]), 4) # 1203 - Race
+  expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1204, ]), 4) # 1204 - Ethnicity
+
+  # Cohort subsets cohort definition 1300 range ------
+  expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1301, ]), 2) # 1301 - Subset overlaps cohort start
+  expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1302, ]), 2) # 1302 - Subset overlaps entire target cohort period
+  expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1303, ]), 2) # 1303 - Subset subsumed by entire target cohort period
+  expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1304, ]), 2) # 1304 - Subset overlaps cohort end
+  expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1305, ]), 2) # 1305 - Subset does NOT overlap cohort end - negate
+  expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1306, ]), 2) # 1306 - Subset overlaps target start - tests combo == all
+})
diff --git a/tests/testthat/test-dbms-platforms.R b/tests/testthat/test-dbms-platforms.R
index 4332f74..14da170 100644
--- a/tests/testthat/test-dbms-platforms.R
+++ b/tests/testthat/test-dbms-platforms.R
@@ -1,5 +1,9 @@
+library(testthat)
+library(CohortGenerator)
+
 testPlatform <- function(dbmsDetails) {
   cohortTableNames <- getCohortTableNames(cohortTable = dbmsDetails$cohortTable)
+  platformOutputFolder <- file.path(outputFolder, dbmsDetails$connectionDetails$dbms)
   on.exit({
     dropCohortStatsTables(
       connectionDetails = dbmsDetails$connectionDetails,
@@ -7,13 +11,9 @@ testPlatform <- function(dbmsDetails) {
       cohortTableNames = cohortTableNames,
       dropCohortTable = TRUE
     )
+    unlink(platformOutputFolder, recursive = TRUE)
   })

-  createCohortTables(
-    connectionDetails = dbmsDetails$connectionDetails,
-    cohortDatabaseSchema = dbmsDetails$cohortDatabaseSchema,
-    cohortTableNames = cohortTableNames
-  )
   cohortsWithStats <- getCohortDefinitionSet(
     settingsFileName = "testdata/name/Cohorts.csv",
     jsonFolder = "testdata/name/cohorts",
@@ -23,47 +23,6 @@ testPlatform <- function(dbmsDetails) {
     packageName = "CohortGenerator",
     verbose = FALSE
   )
-
-  cohortsGenerated <- generateCohortSet(
-    connectionDetails = dbmsDetails$connectionDetails,
-    cdmDatabaseSchema = dbmsDetails$cdmDatabaseSchema,
-    cohortDatabaseSchema = dbmsDetails$cohortDatabaseSchema,
-    cohortTableNames = cohortTableNames,
-    cohortDefinitionSet = cohortsWithStats,
-    incremental = TRUE,
-    incrementalFolder = file.path(outputFolder, "RecordKeeping", dbmsDetails$connectionDetails$dbms)
-  )
-  expect_equal(nrow(cohortsGenerated), nrow(cohortsWithStats))
-
-  # Get the cohort counts
-  cohortCounts <- getCohortCounts(
-    connectionDetails = dbmsDetails$connectionDetails,
-    cohortDatabaseSchema = dbmsDetails$cohortDatabaseSchema,
-    cohortTable = cohortTableNames$cohortTable,
-    databaseId = dbmsDetails$dbmsPlatform,
-    cohortDefinitionSet = cohortsWithStats
-  )
-  expect_equal(nrow(cohortsGenerated), nrow(cohortCounts))
-
-  # Insert the inclusion rule names before exporting the stats tables
-  insertInclusionRuleNames(
-    connectionDetails = dbmsDetails$connectionDetails,
-    cohortDefinitionSet = cohortsWithStats,
-    cohortDatabaseSchema = dbmsDetails$cohortDatabaseSchema,
-    cohortInclusionTable = cohortTableNames$cohortInclusionTable
-  )
-
-  exportCohortStatsTables(
-    connectionDetails = dbmsDetails$connectionDetails,
-    cohortTableNames = cohortTableNames,
-    cohortDatabaseSchema = dbmsDetails$cohortDatabaseSchema,
-    cohortStatisticsFolder = file.path(outputFolder, dbmsDetails$dbmsPlatform),
-    snakeCaseToCamelCase = FALSE,
-    fileNamesInSnakeCase = TRUE,
-    incremental = TRUE,
-    databaseId = dbmsDetails$dbmsPlatform
-  )
-
   subsetOperations <- list(
     createCohortSubset(
       cohortIds = 2,
@@ -91,16 +50,38 @@ testPlatform <- function(dbmsDetails) {
     subsetOperators = subsetOperations
   )
   cohortsWithSubsets <- addCohortSubsetDefinition(cohortsWithStats, subsetDef)
-  cohortsGenerated <- generateCohortSet(
+
+  ncSet <- getNegativeControlOutcomeCohortsForTest()
+
+  runCohortGeneration(
     connectionDetails = dbmsDetails$connectionDetails,
     cdmDatabaseSchema = dbmsDetails$cdmDatabaseSchema,
     cohortDatabaseSchema = dbmsDetails$cohortDatabaseSchema,
     cohortTableNames = cohortTableNames,
     cohortDefinitionSet = cohortsWithSubsets,
-    incremental = TRUE,
-    incrementalFolder = file.path(outputFolder, "RecordKeeping", dbmsDetails$connectionDetails$dbms)
+    negativeControlOutcomeCohortSet = ncSet,
+    occurrenceType = "first",
+    detectOnDescendants = TRUE,
+    outputFolder = platformOutputFolder,
+    databaseId = dbmsDetails$connectionDetails$dbms,
+    incremental = F
   )
+
+  # Check the output to verify the generation worked properly
+  cohortsGenerated <- readCsv(
+    file = file.path(platformOutputFolder, "cg_cohort_generation.csv")
+  )
   expect_equal(nrow(cohortsGenerated), nrow(cohortsWithSubsets))
+
+  cohortCounts <- readCsv(
+    file = file.path(platformOutputFolder, "cg_cohort_count.csv")
+  )
+  expect_equal(nrow(cohortsGenerated), nrow(cohortCounts))
+
+  ncCohortCounts <- readCsv(
+    file = file.path(platformOutputFolder, "cg_cohort_count_neg_ctrl.csv")
+  )
+  expect_equal(nrow(ncSet), nrow(ncCohortCounts))
 }

 # This file contains platform specific tests
@@ -110,8 +91,9 @@ test_that("platform specific create cohorts with stats, Incremental, get results
 for (dbmsPlatform in dbmsPlatforms) {
   dbmsDetails <- getPlatformConnectionDetails(dbmsPlatform)
   if (is.null(dbmsDetails)) {
-    print(paste("No pltatform details available for", dbmsPlatform))
+    print(paste("No platform details available for", dbmsPlatform))
  } else {
+    print(paste("Testing", dbmsPlatform))
     testPlatform(dbmsDetails)
   }
 }
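
To make it easier to try the new runCohortGeneration() entry point end to end, here is a minimal usage sketch against the Eunomia test database, mirroring the happy-path test above. The output folder and the negative control outcome shown here are illustrative choices for this sketch, not part of the changeset:

library(CohortGenerator)

# Eunomia ships a small SQLite CDM; the test suite above uses the same setup.
connectionDetails <- Eunomia::getEunomiaConnectionDetails()

# Cohort definitions bundled with the package for testing.
cohortDefinitionSet <- getCohortDefinitionSet(
  settingsFileName = "testdata/name/Cohorts.csv",
  jsonFolder = "testdata/name/cohorts",
  sqlFolder = "testdata/name/sql/sql_server",
  packageName = "CohortGenerator"
)

# Negative control outcomes are declared as a plain data frame with the
# three columns documented in runCohortGeneration.Rd; this concept_id is
# an illustrative example, not taken from the changeset.
negativeControlOutcomeCohortSet <- data.frame(
  cohortId = 101,
  cohortName = "Gastrointestinal hemorrhage", # illustrative outcome
  outcomeConceptId = 192671
)

outputFolder <- file.path(tempdir(), "runCG") # illustrative path

# Generate all cohorts and export the results data model .csv files in one call.
runCohortGeneration(
  connectionDetails = connectionDetails,
  cdmDatabaseSchema = "main",
  cohortDatabaseSchema = "main",
  cohortTableNames = getCohortTableNames("runCG"),
  cohortDefinitionSet = cohortDefinitionSet,
  negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet,
  occurrenceType = "all",
  detectOnDescendants = TRUE,
  outputFolder = outputFolder,
  databaseId = "Eunomia",
  incremental = FALSE
)

# The export includes one .csv per results table plus resultsDataModelSpecification.csv.
list.files(outputFolder)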
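The exported folder can then be loaded into a results database with the other two new functions. A sketch using a throw-away SQLite file, following the same sequence as test-ResultsDataModel.R (this assumes the folder layout matches the results data model specification, as in the package's Results_Eunomia.zip test fixture):

# A temporary SQLite database stands in for a real results server here.
databaseFile <- tempfile(fileext = ".sqlite")
resultsConnectionDetails <- DatabaseConnector::createConnectionDetails(
  dbms = "sqlite",
  server = databaseFile
)

# Create the results data model tables ("main" is the only schema in SQLite)...
createResultsDataModel(
  connectionDetails = resultsConnectionDetails,
  databaseSchema = "main"
)

# ...then load the .csv files exported by runCohortGeneration().
uploadResults(
  connectionDetails = resultsConnectionDetails,
  schema = "main",
  resultsFolder = outputFolder, # folder from the previous sketch
  purgeSiteDataBeforeUploading = FALSE
)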
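Finally, the new colTypes argument on readCsv() lets callers pin column types instead of relying on readr's type guessing. Reading every column as character is always a safe specification:

# Pin all columns to character to avoid type-guessing surprises; the file
# name refers to the export from the first sketch above.
counts <- CohortGenerator::readCsv(
  file = file.path(outputFolder, "cg_cohort_count.csv"),
  colTypes = readr::cols(.default = readr::col_character())
)

# A compact string such as colTypes = "ici" (integer, character, integer)
# also works, as described in readCsv.Rd above.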