From 7b34d03a96b6e83b6cb9d307c69ca6559f6cd82b Mon Sep 17 00:00:00 2001 From: Anthony Sena Date: Wed, 12 Jun 2024 17:08:09 -0400 Subject: [PATCH] Initial RMM implementation --- DESCRIPTION | 4 +- NAMESPACE | 5 + R/CohortCount.R | 2 +- R/CohortDefinitionSet.R | 4 +- R/CohortSample.R | 2 +- R/CohortTables.R | 2 +- R/NegativeControlCohorts.R | 6 +- R/ResultsDataModel.R | 156 ++++++++++++++++++ R/SubsetDefinitions.R | 2 +- R/SubsetQueryBuilders.R | 6 +- ...tDefinitionSetSpecificationDescription.csv | 0 ...tcomeCohortSetSpecificationDescription.csv | 0 inst/csv/resultsDataModelSpecification.csv | 46 ++++++ .../sql/sql_server/CreateResultsDataModel.sql | 91 ++++++++++ man/createResultsDataModel.Rd | 25 +++ man/getDataMigrator.Rd | 21 +++ man/getResultsDataModelSpecifications.Rd | 14 ++ man/migrateDataModel.Rd | 21 +++ man/uploadResults.Rd | 45 +++++ tests/testthat/helper.R | 2 +- tests/testthat/test-CohortDefinitionSet.R | 2 +- tests/testthat/test-ResultsDataModel.R | 126 ++++++++++++++ tests/testthat/test-Subsets.R | 2 +- 23 files changed, 568 insertions(+), 16 deletions(-) create mode 100644 R/ResultsDataModel.R rename inst/{ => csv}/cohortDefinitionSetSpecificationDescription.csv (100%) rename inst/{ => csv}/negativeControlOutcomeCohortSetSpecificationDescription.csv (100%) create mode 100644 inst/csv/resultsDataModelSpecification.csv create mode 100644 inst/sql/sql_server/CreateResultsDataModel.sql create mode 100644 man/createResultsDataModel.Rd create mode 100644 man/getDataMigrator.Rd create mode 100644 man/getResultsDataModelSpecifications.Rd create mode 100644 man/migrateDataModel.Rd create mode 100644 man/uploadResults.Rd create mode 100644 tests/testthat/test-ResultsDataModel.R diff --git a/DESCRIPTION b/DESCRIPTION index 9e9abc7..e3d8f4d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,6 +27,7 @@ Imports: rlang, RJSONIO, jsonlite, + ResultModelManager, SqlRender (>= 1.11.1), stringi (>= 1.7.6) Suggests: @@ -38,7 +39,8 @@ Suggests: testthat, withr Remotes: - ohdsi/ROhdsiWebApi + ohdsi/ResultModelManager, + ohdsi/ROhdsiWebApi, License: Apache License VignetteBuilder: knitr URL: https://ohdsi.github.io/CohortGenerator/, https://github.com/OHDSI/CohortGenerator diff --git a/NAMESPACE b/NAMESPACE index e5a3448..dbbb12e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(createDemographicSubset) export(createEmptyCohortDefinitionSet) export(createEmptyNegativeControlOutcomeCohortSet) export(createLimitSubset) +export(createResultsDataModel) export(createSubsetCohortWindow) export(dropCohortStatsTables) export(exportCohortStatsTables) @@ -26,7 +27,9 @@ export(getCohortDefinitionSet) export(getCohortInclusionRules) export(getCohortStats) export(getCohortTableNames) +export(getDataMigrator) export(getRequiredTasks) +export(getResultsDataModelSpecifications) export(getSubsetDefinitions) export(insertInclusionRuleNames) export(isCamelCase) @@ -34,12 +37,14 @@ export(isCohortDefinitionSet) export(isFormattedForDatabaseUpload) export(isSnakeCase) export(isTaskRequired) +export(migrateDataModel) export(readCsv) export(recordTasksDone) export(sampleCohortDefinitionSet) export(saveCohortDefinitionSet) export(saveCohortSubsetDefinition) export(saveIncremental) +export(uploadResults) export(writeCsv) import(DatabaseConnector) import(R6) diff --git a/R/CohortCount.R b/R/CohortCount.R index a89e483..faf1378 100644 --- a/R/CohortCount.R +++ b/R/CohortCount.R @@ -55,7 +55,7 @@ getCohortCounts <- function(connectionDetails = NULL, on.exit(DatabaseConnector::disconnect(connection)) } - sql <- SqlRender::readSql(system.file("sql/sql_server/CohortCounts.sql", package = "CohortGenerator", mustWork = TRUE)) + sql <- SqlRender::readSql(system.file("sql/sql_server/CohortCounts.sql", package = utils::packageName(), mustWork = TRUE)) sql <- SqlRender::render( sql = sql, cohort_database_schema = cohortDatabaseSchema, diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R index 42a2113..10668b4 100644 --- a/R/CohortDefinitionSet.R +++ b/R/CohortDefinitionSet.R @@ -183,8 +183,8 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit #' @noRd #' @keywords internal .getCohortDefinitionSetSpecification <- function() { - return(readCsv(system.file("cohortDefinitionSetSpecificationDescription.csv", - package = "CohortGenerator", + return(readCsv(system.file("csv", "cohortDefinitionSetSpecificationDescription.csv", + package = utils::packageName(), mustWork = TRUE ))) } diff --git a/R/CohortSample.R b/R/CohortSample.R index e522244..8c236ff 100644 --- a/R/CohortSample.R +++ b/R/CohortSample.R @@ -79,7 +79,7 @@ tableName = randSampleTableName ) - execSql <- SqlRender::readSql(system.file("sql", "sql_server", "sampling", "RandomSample.sql", package = "CohortGenerator")) + execSql <- SqlRender::readSql(system.file("sql", "sql_server", "sampling", "RandomSample.sql", utils::packageName())) DatabaseConnector::renderTranslateExecuteSql(connection, execSql, tempEmulationSchema = tempEmulationSchema, diff --git a/R/CohortTables.R b/R/CohortTables.R index 19a57c7..c07e3c2 100644 --- a/R/CohortTables.R +++ b/R/CohortTables.R @@ -110,7 +110,7 @@ createCohortTables <- function(connectionDetails = NULL, yes = FALSE, no = (createTableFlagList$cohortSampleTable && cohortTableNames$cohortSampleTable != cohortTableNames$cohortTable) ) - sql <- SqlRender::readSql(system.file("sql/sql_server/CreateCohortTables.sql", package = "CohortGenerator", mustWork = TRUE)) + sql <- SqlRender::readSql(system.file("sql/sql_server/CreateCohortTables.sql", utils::packageName(), mustWork = TRUE)) sql <- SqlRender::render( sql = sql, cohort_database_schema = cohortDatabaseSchema, diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R index 51e4691..e07fbeb 100644 --- a/R/NegativeControlCohorts.R +++ b/R/NegativeControlCohorts.R @@ -52,8 +52,8 @@ createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) { #' @noRd #' @keywords internal .getNegativeControlOutcomeCohortSetSpecification <- function() { - return(readCsv(system.file("negativeControlOutcomeCohortSetSpecificationDescription.csv", - package = "CohortGenerator", + return(readCsv(system.file("csv", "negativeControlOutcomeCohortSetSpecificationDescription.csv", + package = utils::packageName(), mustWork = TRUE ))) } @@ -214,7 +214,7 @@ createNegativeControlOutcomesQuery <- function(connection, cohortTable, occurrenceType, detectOnDescendants) { - sql <- sql <- SqlRender::readSql(system.file("sql/sql_server/NegativeControlOutcomes.sql", package = "CohortGenerator", mustWork = TRUE)) + sql <- sql <- SqlRender::readSql(system.file("sql/sql_server/NegativeControlOutcomes.sql", utils::packageName(), mustWork = TRUE)) sql <- SqlRender::render( sql = sql, cdm_database_schema = cdmDatabaseSchema, diff --git a/R/ResultsDataModel.R b/R/ResultsDataModel.R new file mode 100644 index 0000000..a849fc6 --- /dev/null +++ b/R/ResultsDataModel.R @@ -0,0 +1,156 @@ +# Copyright 2024 Observational Health Data Sciences and Informatics +# +# This file is part of SelfControlledCaseSeries +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +#' Get specifications for CohortGenerator results data model +#' +#' @return +#' A tibble data frame object with specifications +#' +#' @export +getResultsDataModelSpecifications <- function() { + resultsDataModelSpecifications <- readCsv( + file = system.file("csv", "resultsDataModelSpecification.csv", package = utils::packageName()) + ) + return(resultsDataModelSpecifications) +} + +#' Create the results data model tables on a database server. +#' +#' @details +#' Only PostgreSQL and SQLite servers are supported. +#' +#' @param connectionDetails DatabaseConnector connectionDetails instance @seealso[DatabaseConnector::createConnectionDetails] +#' @param databaseSchema The schema on the server where the tables will be created. +#' @param tablePrefix (Optional) string to insert before table names for database table names +#' @export +createResultsDataModel <- function(connectionDetails = NULL, + databaseSchema, + tablePrefix = "") { + if (connectionDetails$dbms == "sqlite" & databaseSchema != "main") { + stop("Invalid schema for sqlite, use databaseSchema = 'main'") + } + + connection <- DatabaseConnector::connect(connectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + + # Create first version of results model: + sql <- SqlRender::loadRenderTranslateSql( + sqlFilename = "CreateResultsDataModel.sql", + packageName = utils::packageName(), + dbms = connection@dbms, + database_schema = databaseSchema, + table_prefix = tablePrefix + ) + DatabaseConnector::executeSql(connection, sql) + # Migrate to current version: + migrateDataModel( + connectionDetails = connectionDetails, + databaseSchema = databaseSchema, + tablePrefix = tablePrefix + ) +} + +#' Upload results to the database server. +#' +#' @description +#' Requires the results data model tables have been created using the \code{\link{createResultsDataModel}} function. +#' +#' @param connectionDetails An object of type \code{connectionDetails} as created using the +#' \code{\link[DatabaseConnector]{createConnectionDetails}} function in the +#' DatabaseConnector package. +#' @param schema The schema on the server where the tables have been created. +#' @param zipFileName The name of the zip file. +#' @param forceOverWriteOfSpecifications If TRUE, specifications of the phenotypes, cohort definitions, and analysis +#' will be overwritten if they already exist on the database. Only use this if these specifications +#' have changed since the last upload. +#' @param purgeSiteDataBeforeUploading If TRUE, before inserting data for a specific databaseId all the data for +#' that site will be dropped. This assumes the input zip file contains the full data for that +#' data site. +#' @param tempFolder A folder on the local file system where the zip files are extracted to. Will be cleaned +#' up when the function is finished. Can be used to specify a temp folder on a drive that +#' has sufficient space if the default system temp space is too limited. +#' @param tablePrefix (Optional) string to insert before table names for database table names +#' @param ... See ResultModelManager::uploadResults +#' @export +uploadResults <- function(connectionDetails, + schema, + zipFileName, + forceOverWriteOfSpecifications = FALSE, + purgeSiteDataBeforeUploading = TRUE, + tempFolder = tempdir(), + tablePrefix = "", + ...) { + unzipFolder <- tempfile("unzipTempFolder", tmpdir = tempFolder) + dir.create(path = unzipFolder, recursive = TRUE) + on.exit(unlink(unzipFolder, recursive = TRUE), add = TRUE) + + ParallelLogger::logInfo("Unzipping ", zipFileName) + zip::unzip(zipFileName, exdir = unzipFolder) + + ResultModelManager::uploadResults( + connectionDetails = connectionDetails, + schema = schema, + resultsFolder = unzipFolder, + tablePrefix = tablePrefix, + forceOverWriteOfSpecifications = forceOverWriteOfSpecifications, + purgeSiteDataBeforeUploading = purgeSiteDataBeforeUploading, + runCheckAndFixCommands = FALSE, + specifications = getResultsDataModelSpecifications(), + warnOnMissingTable = FALSE, + ... + ) +} + +#' Migrate Data model +#' @description +#' Migrate data from current state to next state +#' +#' It is strongly advised that you have a backup of all data (either sqlite files, a backup database (in the case you +#' are using a postgres backend) or have kept the csv/zip files from your data generation. +#' +#' @inheritParams getDataMigrator +#' @export +migrateDataModel <- function(connectionDetails, databaseSchema, tablePrefix = "") { + ParallelLogger::logInfo("Migrating data set") + migrator <- getDataMigrator(connectionDetails = connectionDetails, + databaseSchema = databaseSchema, + tablePrefix = tablePrefix) + migrator$executeMigrations() + migrator$finalize() +} + +#' Get database migrations instance +#' @description +#' +#' Returns ResultModelManager DataMigrationsManager instance. +# '@seealso [ResultModelManager::DataMigrationManager] which this function is a utility for. +#' +#' @param connectionDetails DatabaseConnector connection details object +#' @param databaseSchema String schema where database schema lives +#' @param tablePrefix (Optional) Use if a table prefix is used before table names (e.g. "cd_") +#' @returns Instance of ResultModelManager::DataMigrationManager that has interface for converting existing data models +#' @export +getDataMigrator <- function(connectionDetails, databaseSchema, tablePrefix = "") { + ResultModelManager::DataMigrationManager$new( + connectionDetails = connectionDetails, + databaseSchema = databaseSchema, + tablePrefix = tablePrefix, + packageTablePrefix = "cg_", + migrationPath = "migrations", + packageName = utils::packageName() + ) +} \ No newline at end of file diff --git a/R/SubsetDefinitions.R b/R/SubsetDefinitions.R index 4453fd6..3375595 100644 --- a/R/SubsetDefinitions.R +++ b/R/SubsetDefinitions.R @@ -112,7 +112,7 @@ CohortSubsetDefinition <- R6::R6Class( dropTables <- c(dropTables, targetTable) } - sql <- c(sql, SqlRender::readSql(system.file("sql", "sql_server", "subsets", "CohortSubsetDefinition.sql", package = "CohortGenerator"))) + sql <- c(sql, SqlRender::readSql(system.file("sql", "sql_server", "subsets", "CohortSubsetDefinition.sql", utils::packageName()))) # Cleanup after exectuion for (table in dropTables) { sql <- c(sql, SqlRender::render("DROP TABLE IF EXISTS @table;", table = table)) diff --git a/R/SubsetQueryBuilders.R b/R/SubsetQueryBuilders.R index 4edef34..57b8ee9 100644 --- a/R/SubsetQueryBuilders.R +++ b/R/SubsetQueryBuilders.R @@ -49,7 +49,7 @@ CohortSubsetQb <- R6::R6Class( inherit = QueryBuilder, private = list( innerQuery = function(targetTable) { - sql <- SqlRender::readSql(system.file("sql", "sql_server", "subsets", "CohortSubsetOperator.sql", package = "CohortGenerator")) + sql <- SqlRender::readSql(system.file("sql", "sql_server", "subsets", "CohortSubsetOperator.sql", utils::packageName())) sql <- SqlRender::render(sql, target_table = targetTable, output_table = self$getTableObjectId(), @@ -83,7 +83,7 @@ LimitSubsetQb <- R6::R6Class( inherit = QueryBuilder, private = list( innerQuery = function(targetTable) { - sql <- SqlRender::readSql(system.file("sql", "sql_server", "subsets", "LimitSubsetOperator.sql", package = "CohortGenerator")) + sql <- SqlRender::readSql(system.file("sql", "sql_server", "subsets", "LimitSubsetOperator.sql", utils::packageName())) sql <- SqlRender::render(sql, calendar_end_date = ifelse(is.null(private$operator$calendarEndDate), yes = "0", no = "1"), calendar_end_date_day = ifelse(is.null(private$operator$calendarEndDate), yes = "", no = lubridate::day(private$operator$calendarEndDate)), @@ -111,7 +111,7 @@ DemographicSubsetQb <- R6::R6Class( inherit = QueryBuilder, private = list( innerQuery = function(targetTable) { - sql <- SqlRender::readSql(system.file("sql", "sql_server", "subsets", "DemographicSubsetOperator.sql", package = "CohortGenerator")) + sql <- SqlRender::readSql(system.file("sql", "sql_server", "subsets", "DemographicSubsetOperator.sql", utils::packageName())) sql <- SqlRender::render(sql, target_table = targetTable, output_table = self$getTableObjectId(), diff --git a/inst/cohortDefinitionSetSpecificationDescription.csv b/inst/csv/cohortDefinitionSetSpecificationDescription.csv similarity index 100% rename from inst/cohortDefinitionSetSpecificationDescription.csv rename to inst/csv/cohortDefinitionSetSpecificationDescription.csv diff --git a/inst/negativeControlOutcomeCohortSetSpecificationDescription.csv b/inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv similarity index 100% rename from inst/negativeControlOutcomeCohortSetSpecificationDescription.csv rename to inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv diff --git a/inst/csv/resultsDataModelSpecification.csv b/inst/csv/resultsDataModelSpecification.csv new file mode 100644 index 0000000..ccfd025 --- /dev/null +++ b/inst/csv/resultsDataModelSpecification.csv @@ -0,0 +1,46 @@ +table_name,column_name,data_type,is_required,primary_key,min_cell_count,description +cohort_definition,cohort_definition_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition +cohort_definition,cohort_name,varchar,Yes,No,No,The name of the cohort definition +cohort_definition,description,varchar,No,No,No,A description of the cohort definition +cohort_definition,json,text,No,No,No,The circe-be compiliant JSON expression +cohort_definition,sql_command,text,No,No,No,The OHDSI-SQL command used to construct the cohort +cohort_definition,subset_parent,bigint,No,No,No,The parent cohort id if this cohort is a subset +cohort_definition,is_subset,int,No,No,No,This value is 1 when the cohort is a subset +cohort_definition,subset_definition_id,bigint,No,No,No,The cohort subset definition +cohort_generation,cohort_id,bigint,Yes,Yes,No,The uniqe identifier for the cohort definition +cohort_generation,cohort_name,varchar,Yes,No,No,The name of the cohort generated +cohort_generation,generation_status,varchar,No,No,No,The cohort generation status +cohort_generation,start_time,Timestamp,No,No,No,The start time of the generation process +cohort_generation,end_time,Timestamp,No,No,No,The end time of the generation process +cohort_generation,database_id,varchar,Yes,Yes,No,The database idenifier for this information +cohort_inclusion,cohort_definition_id,bigint,bigint,Yes,No,The unique identifier for the cohort definition +cohort_inclusion,rule_sequence,int,Yes,Yes,No,The rule sequence for the inclusion rule +cohort_inclusion,name,varchar,Yes,Yes,No,The name of the inclusion rule +cohort_inclusion,description,varchar,No,No,No,The description of the inclusion rule +cohort_inc_result,database_id,varchar,Yes,Yes,No,The database idenifier for this information +cohort_inc_result,cohort_definition_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition +cohort_inc_result,inclusion_rule_mask,int,Yes,Yes,No,A bit-mask for the inclusion rule +cohort_inc_result,person_count,bigint,Yes,Yes,Yes,The number of persons satisifying the inclusion rule +cohort_inc_result,mode_id,int,Yes,Yes,No,The mode of the inclusion rule. +cohort_inc_stats,database_id,varchar,Yes,Yes,No,The database idenifier for this information +cohort_inc_stats,cohort_definition_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition +cohort_inc_stats,rule_sequence,int,Yes,Yes,No,The rule sequence +cohort_inc_stats,person_count,bigint,Yes,Yes,Yes,The person count +cohort_inc_stats,gain_count,bigint,Yes,Yes,No,The gain count +cohort_inc_stats,person_total,bigint,Yes,Yes,Yes,The person total +cohort_inc_stats,mode_id,int,Yes,Yes,No,The mode id +cohort_summary_stats,database_id,varchar,Yes,Yes,No,The database idenifier for this information +cohort_summary_stats,cohort_definition_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition +cohort_summary_stats,base_count,bigint,Yes,Yes,Yes,The base count +cohort_summary_stats,final_count,bigint,Yes,Yes,Yes,The final count +cohort_summary_stats,mode_id,int,Yes,Yes,No,The mode id +cohort_censor_stats,cohort_definition_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition +cohort_censor_stats,lost_count,bigint,Yes,Yes,Yes,The number lost due to censoring +cohort_count,database_id,varchar,Yes,Yes,No,The database idenifier for this information +cohort_count,cohort_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition +cohort_count,cohort_entries,bigint,Yes,Yes,Yes,The number of cohort entries +cohort_count,cohort_subjects,bigint,Yes,Yes,Yes,The number of unique subjects +cohort_count_neg_ctrl,database_id,varchar,Yes,Yes,No,The database idenifier for this information +cohort_count_neg_ctrl,cohort_id,bigint,Yes,Yes,No,The unique identifier for the cohort definition +cohort_count_neg_ctrl,cohort_entries,bigint,Yes,Yes,Yes,The number of cohort entries +cohort_count_neg_ctrl,cohort_subjects,bigint,Yes,Yes,Yes,The number of unique subjects diff --git a/inst/sql/sql_server/CreateResultsDataModel.sql b/inst/sql/sql_server/CreateResultsDataModel.sql new file mode 100644 index 0000000..14a5b75 --- /dev/null +++ b/inst/sql/sql_server/CreateResultsDataModel.sql @@ -0,0 +1,91 @@ +{DEFAULT @table_prefix = ''} +{DEFAULT @cohort_definition = cohort_definition} +{DEFAULT @cohort_generation = cohort_generation} +{DEFAULT @cohort_inclusion = cohort_inclusion} +{DEFAULT @cohort_inc_result = cohort_inc_result} +{DEFAULT @cohort_inc_stats = cohort_inc_stats} +{DEFAULT @cohort_summary_stats = cohort_summary_stats} +{DEFAULT @cohort_censor_stats = cohort_censor_stats} +{DEFAULT @cohort_count = cohort_count} +{DEFAULT @cohort_count_neg_ctrl = cohort_count_neg_ctrl} + +CREATE TABLE @database_schema.@table_prefix@cohort_definition ( + cohort_definition_id BIGINT NOT NULL, + cohort_name VARCHAR, + description VARCHAR, + json TEXT, + sql_command TEXT, + subset_parent BIGINT, + is_subset INT, + subset_definition_id BIGINT, + PRIMARY KEY(cohort_definition_id) +); + +CREATE TABLE @database_schema.@table_prefix@cohort_generation ( + cohort_id BIGINT NOT NULL, + cohort_name VARCHAR, + generation_status VARCHAR, + start_time TIMESTAMP, + end_time TIMESTAMP, + database_id VARCHAR NOT NULL, + PRIMARY KEY(cohort_id,database_id) +); + +CREATE TABLE @database_schema.@table_prefix@cohort_inclusion ( + cohort_definition_id BIGINT NOT NULL, + rule_sequence INT NOT NULL, + name VARCHAR NOT NULL, + description VARCHAR, + PRIMARY KEY(cohort_definition_id,rule_sequence,name) +); + +CREATE TABLE @database_schema.@table_prefix@cohort_inc_result ( + database_id VARCHAR NOT NULL, + cohort_definition_id BIGINT NOT NULL, + inclusion_rule_mask INT NOT NULL, + person_count BIGINT NOT NULL, + mode_id INT NOT NULL, + PRIMARY KEY(database_id,cohort_definition_id,inclusion_rule_mask,person_count,mode_id) +); + +CREATE TABLE @database_schema.@table_prefix@cohort_inc_stats ( + database_id VARCHAR NOT NULL, + cohort_definition_id BIGINT NOT NULL, + rule_sequence INT NOT NULL, + person_count BIGINT NOT NULL, + gain_count BIGINT NOT NULL, + person_total BIGINT NOT NULL, + mode_id INT NOT NULL, + PRIMARY KEY(database_id,cohort_definition_id,rule_sequence,person_count,gain_count,person_total,mode_id) +); + +CREATE TABLE @database_schema.@table_prefix@cohort_summary_stats ( + database_id VARCHAR NOT NULL, + cohort_definition_id BIGINT NOT NULL, + base_count BIGINT NOT NULL, + final_count BIGINT NOT NULL, + mode_id INT NOT NULL, + PRIMARY KEY(database_id,cohort_definition_id,base_count,final_count,mode_id) +); + +CREATE TABLE @database_schema.@table_prefix@cohort_censor_stats ( + cohort_definition_id BIGINT NOT NULL, + lost_count BIGINT NOT NULL, + PRIMARY KEY(cohort_definition_id,lost_count) +); + +CREATE TABLE @database_schema.@table_prefix@cohort_count ( + database_id VARCHAR NOT NULL, + cohort_id BIGINT NOT NULL, + cohort_entries BIGINT NOT NULL, + cohort_subjects BIGINT NOT NULL, + PRIMARY KEY(database_id,cohort_id,cohort_entries,cohort_subjects) +); + +CREATE TABLE @database_schema.@table_prefix@cohort_count_neg_ctrl ( + database_id VARCHAR NOT NULL, + cohort_id BIGINT NOT NULL, + cohort_entries BIGINT NOT NULL, + cohort_subjects BIGINT NOT NULL, + PRIMARY KEY(database_id,cohort_id,cohort_entries,cohort_subjects) +); diff --git a/man/createResultsDataModel.Rd b/man/createResultsDataModel.Rd new file mode 100644 index 0000000..105eb23 --- /dev/null +++ b/man/createResultsDataModel.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ResultsDataModel.R +\name{createResultsDataModel} +\alias{createResultsDataModel} +\title{Create the results data model tables on a database server.} +\usage{ +createResultsDataModel( + connectionDetails = NULL, + databaseSchema, + tablePrefix = "" +) +} +\arguments{ +\item{connectionDetails}{DatabaseConnector connectionDetails instance @seealso[DatabaseConnector::createConnectionDetails]} + +\item{databaseSchema}{The schema on the server where the tables will be created.} + +\item{tablePrefix}{(Optional) string to insert before table names for database table names} +} +\description{ +Create the results data model tables on a database server. +} +\details{ +Only PostgreSQL and SQLite servers are supported. +} diff --git a/man/getDataMigrator.Rd b/man/getDataMigrator.Rd new file mode 100644 index 0000000..0ddbe7b --- /dev/null +++ b/man/getDataMigrator.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ResultsDataModel.R +\name{getDataMigrator} +\alias{getDataMigrator} +\title{Get database migrations instance} +\usage{ +getDataMigrator(connectionDetails, databaseSchema, tablePrefix = "") +} +\arguments{ +\item{connectionDetails}{DatabaseConnector connection details object} + +\item{databaseSchema}{String schema where database schema lives} + +\item{tablePrefix}{(Optional) Use if a table prefix is used before table names (e.g. "cd_")} +} +\value{ +Instance of ResultModelManager::DataMigrationManager that has interface for converting existing data models +} +\description{ +Returns ResultModelManager DataMigrationsManager instance. +} diff --git a/man/getResultsDataModelSpecifications.Rd b/man/getResultsDataModelSpecifications.Rd new file mode 100644 index 0000000..b528db2 --- /dev/null +++ b/man/getResultsDataModelSpecifications.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ResultsDataModel.R +\name{getResultsDataModelSpecifications} +\alias{getResultsDataModelSpecifications} +\title{Get specifications for CohortGenerator results data model} +\usage{ +getResultsDataModelSpecifications() +} +\value{ +A tibble data frame object with specifications +} +\description{ +Get specifications for CohortGenerator results data model +} diff --git a/man/migrateDataModel.Rd b/man/migrateDataModel.Rd new file mode 100644 index 0000000..a764238 --- /dev/null +++ b/man/migrateDataModel.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ResultsDataModel.R +\name{migrateDataModel} +\alias{migrateDataModel} +\title{Migrate Data model} +\usage{ +migrateDataModel(connectionDetails, databaseSchema, tablePrefix = "") +} +\arguments{ +\item{connectionDetails}{DatabaseConnector connection details object} + +\item{databaseSchema}{String schema where database schema lives} + +\item{tablePrefix}{(Optional) Use if a table prefix is used before table names (e.g. "cd_")} +} +\description{ +Migrate data from current state to next state + +It is strongly advised that you have a backup of all data (either sqlite files, a backup database (in the case you +are using a postgres backend) or have kept the csv/zip files from your data generation. +} diff --git a/man/uploadResults.Rd b/man/uploadResults.Rd new file mode 100644 index 0000000..4f68dee --- /dev/null +++ b/man/uploadResults.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ResultsDataModel.R +\name{uploadResults} +\alias{uploadResults} +\title{Upload results to the database server.} +\usage{ +uploadResults( + connectionDetails, + schema, + zipFileName, + forceOverWriteOfSpecifications = FALSE, + purgeSiteDataBeforeUploading = TRUE, + tempFolder = tempdir(), + tablePrefix = "", + ... +) +} +\arguments{ +\item{connectionDetails}{An object of type \code{connectionDetails} as created using the +\code{\link[DatabaseConnector]{createConnectionDetails}} function in the +DatabaseConnector package.} + +\item{schema}{The schema on the server where the tables have been created.} + +\item{zipFileName}{The name of the zip file.} + +\item{forceOverWriteOfSpecifications}{If TRUE, specifications of the phenotypes, cohort definitions, and analysis +will be overwritten if they already exist on the database. Only use this if these specifications +have changed since the last upload.} + +\item{purgeSiteDataBeforeUploading}{If TRUE, before inserting data for a specific databaseId all the data for +that site will be dropped. This assumes the input zip file contains the full data for that +data site.} + +\item{tempFolder}{A folder on the local file system where the zip files are extracted to. Will be cleaned +up when the function is finished. Can be used to specify a temp folder on a drive that +has sufficient space if the default system temp space is too limited.} + +\item{tablePrefix}{(Optional) string to insert before table names for database table names} + +\item{...}{See ResultModelManager::uploadResults} +} +\description{ +Requires the results data model tables have been created using the \code{\link{createResultsDataModel}} function. +} diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index 1e171bd..6bc578a 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -35,7 +35,7 @@ getCohortsForTest <- function(cohorts, generateStats = FALSE) { } # This will gather all of the cohort JSON in the package for use in the tests -cohortJsonFiles <- list.files(path = system.file("testdata/name/cohorts", package = "CohortGenerator"), full.names = TRUE) +cohortJsonFiles <- list.files(path = system.file("testdata/name/cohorts", utils::packageName()), full.names = TRUE) cohorts <- setNames(data.frame(matrix(ncol = 5, nrow = 0), stringsAsFactors = FALSE), c("atlasId", "cohortId", "cohortName", "json", "cohortJsonFile")) for (i in 1:length(cohortJsonFiles)) { cohortJsonFileName <- cohortJsonFiles[i] diff --git a/tests/testthat/test-CohortDefinitionSet.R b/tests/testthat/test-CohortDefinitionSet.R index 6f48618..e3dd5f4 100644 --- a/tests/testthat/test-CohortDefinitionSet.R +++ b/tests/testthat/test-CohortDefinitionSet.R @@ -163,7 +163,7 @@ test_that("Call saveCohortDefinitionSet with missing json", { # Fill the cohort set using cohorts included in this # package as an example - cohortJsonFiles <- list.files(path = system.file("testdata/name/cohorts", package = "CohortGenerator"), full.names = TRUE) + cohortJsonFiles <- list.files(path = system.file("testdata/name/cohorts", utils::packageName()), full.names = TRUE) for (i in 1:length(cohortJsonFiles)) { cohortJsonFileName <- cohortJsonFiles[i] cohortName <- tools::file_path_sans_ext(basename(cohortJsonFileName)) diff --git a/tests/testthat/test-ResultsDataModel.R b/tests/testthat/test-ResultsDataModel.R new file mode 100644 index 0000000..159093f --- /dev/null +++ b/tests/testthat/test-ResultsDataModel.R @@ -0,0 +1,126 @@ +library(CohortGenerator) +library(testthat) + +if (dir.exists(Sys.getenv("DATABASECONNECTOR_JAR_FOLDER"))) { + jdbcDriverFolder <- Sys.getenv("DATABASECONNECTOR_JAR_FOLDER") +} else { + jdbcDriverFolder <- "~/.jdbcDrivers" + dir.create(jdbcDriverFolder, showWarnings = FALSE) + DatabaseConnector::downloadJdbcDrivers("postgresql", pathToDriver = jdbcDriverFolder) + withr::defer( + { + unlink(jdbcDriverFolder, recursive = TRUE, force = TRUE) + }, + testthat::teardown_env() + ) +} + +postgresConnectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = "postgresql", + user = Sys.getenv("CDM5_POSTGRESQL_USER"), + password = URLdecode(Sys.getenv("CDM5_POSTGRESQL_PASSWORD")), + server = Sys.getenv("CDM5_POSTGRESQL_SERVER"), + pathToDriver = jdbcDriverFolder +) + +postgresResultsDatabaseSchema <- paste0("r", Sys.getpid(), format(Sys.time(), "%s"), sample(1:100, 1)) + +databaseFile <- tempfile(fileext = ".sqlite") +sqliteConnectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = "sqlite", + server = databaseFile +) +sqliteResultsDatabaseSchema <- "main" + +withr::defer({ + connection <- DatabaseConnector::connect(connectionDetails = postgresConnectionDetails) + sql <- "DROP SCHEMA IF EXISTS @resultsDatabaseSchema CASCADE;" + DatabaseConnector::renderTranslateExecuteSql( + sql = sql, + resultsDatabaseSchema = postgresResultsDatabaseSchema, + connection = connection + ) + + DatabaseConnector::disconnect(connection) + unlink(databaseFile, force = TRUE) +}, +testthat::teardown_env() +) + +testCreateSchema <- function(connectionDetails, resultsDatabaseSchema) { + connection <- DatabaseConnector::connect(connectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + if (connectionDetails$dbms != "sqlite") { + sql <- "CREATE SCHEMA @resultsDatabaseSchema;" + DatabaseConnector::renderTranslateExecuteSql( + sql = sql, + resultsDatabaseSchema = resultsDatabaseSchema, + connection = connection + ) + } + suppressWarnings( + createResultsDataModel( + connectionDetails = connectionDetails, + databaseSchema = resultsDatabaseSchema, + tablePrefix = "" + ) + ) + specifications <- getResultsDataModelSpecifications() + for (tableName in unique(specifications$tableName)) { + expect_true(DatabaseConnector::existsTable(connection = connection, + databaseSchema = resultsDatabaseSchema, + tableName = tableName)) + } + # Bad schema name + expect_error(createResultsDataModel( + connectionDetails = connectionDetails, + databaseSchema = "non_existant_schema" + )) +} + +test_that("Create schema", { + testCreateSchema(connectionDetails = postgresConnectionDetails, + resultsDatabaseSchema = postgresResultsDatabaseSchema) + testCreateSchema(connectionDetails = sqliteConnectionDetails, + resultsDatabaseSchema = sqliteResultsDatabaseSchema) +}) + +testUploadResults <- function(connectionDetails, resultsDatabaseSchema) { + uploadResults( + connectionDetails = connectionDetails, + schema = resultsDatabaseSchema, + zipFileName = system.file("Results_Eunomia.zip", package = "SelfControlledCaseSeries"), + purgeSiteDataBeforeUploading = FALSE) + + # Check if there's data: + connection <- DatabaseConnector::connect(connectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + + specifications <- getResultsDataModelSpecifications() + for (tableName in unique(specifications$tableName)) { + primaryKey <- specifications %>% + dplyr::filter(tableName == !!tableName & + primaryKey == "Yes") %>% + dplyr::select(columnName) %>% + dplyr::pull() + + if ("database_id" %in% primaryKey) { + sql <- "SELECT COUNT(*) FROM @database_schema.@table_name WHERE database_id = '@database_id';" + databaseIdCount <- DatabaseConnector::renderTranslateQuerySql( + connection = connection, + sql = sql, + database_schema = resultsDatabaseSchema, + table_name = tableName, + database_id = "Eunomia" + )[, 1] + expect_true(databaseIdCount >= 0) + } + } +} + +test_that("Results upload", { + testUploadResults(connectionDetails = postgresConnectionDetails, + resultsDatabaseSchema = postgresResultsDatabaseSchema) + testUploadResults(connectionDetails = sqliteConnectionDetails, + resultsDatabaseSchema = sqliteResultsDatabaseSchema) +}) \ No newline at end of file diff --git a/tests/testthat/test-Subsets.R b/tests/testthat/test-Subsets.R index 446ade2..078ce45 100644 --- a/tests/testthat/test-Subsets.R +++ b/tests/testthat/test-Subsets.R @@ -292,7 +292,7 @@ test_that("subset generation", { test_that("Subset definition creation and retrieval with definitionId != 1", { sampleCohorts <- CohortGenerator::createEmptyCohortDefinitionSet() - cohortJsonFiles <- list.files(path = system.file("testdata/name/cohorts", package = "CohortGenerator"), full.names = TRUE) + cohortJsonFiles <- list.files(path = system.file("testdata/name/cohorts", utils::packageName()), full.names = TRUE) cohortJsonFileName <- cohortJsonFiles[1] cohortName <- tools::file_path_sans_ext(basename(cohortJsonFileName)) cohortJson <- readChar(cohortJsonFileName, file.info(cohortJsonFileName)$size)