add cell type annotation functions

MangiolaLaboratory · Feb 8, 2024 · 6aac48e · 6aac48e
1 parent f0f7bdc
commit 6aac48e
Show file tree

Hide file tree

Showing 8 changed files with 1,168 additions and 18 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(alive_identification)
+export(annotation_consensus)
 export(annotation_label_transfer)
 export(cell_cycle_scoring)
 export(doublet_identification)
@@ -66,6 +67,8 @@ importFrom(celldex,BlueprintEncodeData)
 importFrom(celldex,MonacoImmuneData)
 importFrom(crew,crew_controller_local)
 importFrom(digest,digest)
+importFrom(dplyr,"%>%")
+importFrom(dplyr,case_when)
 importFrom(dplyr,count)
 importFrom(dplyr,filter)
 importFrom(dplyr,if_else)
@@ -94,8 +97,13 @@ importFrom(scater,isOutlier)
 importFrom(scuttle,logNormCounts)
 importFrom(scuttle,perCellQCMetrics)
 importFrom(stats,as.formula)
+importFrom(stringr,str_detect)
 importFrom(stringr,str_remove)
+importFrom(stringr,str_remove_all)
+importFrom(stringr,str_replace)
+importFrom(stringr,str_replace_all)
 importFrom(stringr,str_subset)
+importFrom(stringr,str_trim)
 importFrom(stringr,str_which)
 importFrom(targets,tar_config_get)
 importFrom(targets,tar_option_set)

diff --git a/R/execute_pipeline.R b/R/execute_pipeline.R
@@ -168,21 +168,21 @@ run_targets_pipeline <- function(
       # tarchetypes::tar_files(name= reference_track,
       #                        read_reference_file, 
       #                        deployment = "main"),
-      tar_target(filter_empty_droplets, filtered_file, deployment = "main"),
-      tar_target(tissue, tissue_file, deployment = "main"),
-      tar_target(sample_column, sample_column_file, deployment = "main"),
-      tar_target(reference_label_coarse, reference_label_coarse_id(tissue), deployment = "main"), 
-      tar_target(reference_label_fine, reference_label_fine_id(tissue), deployment = "main"), 
+      tar_target(do_filter_empty_droplets, filtered_file, deployment = "main"),
+      tar_target(tissue_type, tissue_file, deployment = "main"),
+      tar_target(sample_column_name, sample_column_file, deployment = "main"),
+      tar_target(reference_label_coarse, reference_label_coarse_id(tissue_type), deployment = "main"), 
+      tar_target(reference_label_fine, reference_label_fine_id(tissue_type), deployment = "main"), 
       # Reading input files
       tar_target(input_read, readRDS(read_file),
                  pattern = map(read_file),
                  iteration = "list", deployment = "main"),
 
-      tar_target(reference_read, reference_file, deployment = "main"),
+      tar_target(reference_read, switch((!is.null(reference_file)) + 1, NULL, readRDS(reference_file)), deployment = "main"),
 
       # Identifying empty droplets
       tar_target(empty_droplets_tbl,
-                 empty_droplet_id(input_read, filter_empty_droplets),
+                 empty_droplet_id(input_read, do_filter_empty_droplets),
                  pattern = map(input_read),
                  iteration = "list"),
 
@@ -235,7 +235,7 @@ run_targets_pipeline <- function(
                  iteration = "list"),
 
       # Pre-processing output
-      tar_target(preprocessing_output_S, preprocessing_output(tissue,
+      tar_target(preprocessing_output_S, preprocessing_output(tissue_type,
                                                               non_batch_variation_removal_S,
                                                               alive_identification_tbl,
                                                               cell_cycle_score_tbl,
@@ -251,7 +251,7 @@ run_targets_pipeline <- function(
       # pseudobulk preprocessing
       tar_target(pseudobulk_preprocessing_SE, pseudobulk_preprocessing(reference_label_fine,
                                                                        preprocessing_output_S, 
-                                                                       !!sample_column)
+                                                                       sample_column_name)
 
     )))
 

diff --git a/R/functions.R b/R/functions.R
@@ -165,7 +165,7 @@ annotation_label_transfer <- function(input_read_RNA_assay,
 
 
     # Subset
-    RNA_assay = input_read_RNA_assay[[assay]][rownames(input_read_RNA_assay[[assay]]) %in% rownames(reference_azimuth[["SCT"]]),]
+    RNA_assay = input_read_RNA_assay[rownames(input_read_RNA_assay[[assay]]) %in% rownames(reference_azimuth[["SCT"]]),][[assay]]
     #RNA_assay <- input_read_RNA_assay@assays$RNA[["counts"]][rownames(input_read_RNA_assay@assays$RNA[["counts"]])%in% rownames(reference_azimuth[["SCT"]]),]
     #ADT_assay = input_read_RNA_assay[["ADT"]][rownames(input_read_RNA_assay[["ADT"]]) %in% rownames(reference_azimuth[["ADT"]]),]
     input_read_RNA_assay <- CreateSeuratObject( counts = RNA_assay)
@@ -1272,4 +1272,99 @@ map_split_sce_by_gene = function(sce_df, .col, how_many_chunks_base = 10, max_ce
     mutate(sce_md5 = map_chr(!!.col, digest))
 }
 
+#' Harmonize cell type annotations based on consensus
+#'
+#' This function harmonizes cell type annotations by matching them with a reference annotation
+#' and applying specific rules for non-immune cell types.
+#'
+#' @param single_cell_data A data frame containing single-cell data with cell type annotations.
+#' @param .sample_column The column name specifying sample information.
+#' @param .cell_type The column name for the cell type annotations.
+#' @param .azimuth The column name for Azimuth annotations.
+#' @param .blueprint The column name for Blueprint annotations.
+#' @param .monaco The column name for Monaco annotations.
+#'
+#' @return A data frame with harmonized cell type annotations.
+#'
+#' @export
+annotation_consensus = function(single_cell_data, .sample_column, .cell_type, .azimuth, .blueprint, .monaco){
+
+  .sample_column = enquo(.sample_column)
+  .azimuth = enquo(.azimuth)
+  .blueprint = enquo(.blueprint)
+  .monaco = enquo(.monaco)
+  .cell_type = enquo(.cell_type)
+
+  # reference_annotation =
+  #   CuratedAtlasQueryR::get_metadata() |> 
+  #   filter(cell_type_harmonised!="immune_unclassified" | is.na(cell_type_harmonised)) |> 
+  #   select(cell_type,
+  #          cell_type_harmonised, 
+  #          cell_annotation_azimuth_l2, 
+  #          cell_annotation_blueprint_singler, 
+  #          cell_annotation_monaco_singler,
+  #          confidence_class
+  #   ) |> 
+  #   as_tibble() |> 
+  #   mutate(cell_type_clean = cell_type |> clean_cell_types()) |> 
+  #   clean_cell_types_deeper() |> 
+  #   select(-cell_type) |> 
+  #   
+  #   count(cell_type_harmonised, cell_annotation_azimuth_l2, cell_annotation_blueprint_singler, cell_annotation_monaco_singler, confidence_class, cell_type_clean) |>
+  #   with_groups(c(cell_annotation_azimuth_l2, cell_annotation_blueprint_singler, cell_annotation_monaco_singler, cell_type_clean), ~ .x |> arrange(desc(n)) |> slice(1) )
+  # 
+  # reference_annotation |> saveRDS("reference_annotation_16_jan_2024.rds")
+
+  reference_annotation = readRDS("reference_annotation_16_jan_2024.rds")
+
+  annotation=
+    single_cell_data |>
+    rename(
+      .sample := !!.sample_column,
+      cell_type := !!.cell_type,
+      cell_annotation_azimuth_l2 := !!.azimuth,
+      cell_annotation_blueprint_singler := !!.blueprint,
+      cell_annotation_monaco_singler := !!.monaco
+    ) |>
+    select(.cell, .sample, cell_type, cell_annotation_azimuth_l2,cell_annotation_blueprint_singler, cell_annotation_monaco_singler) |> 
+    mutate(across(c(cell_annotation_azimuth_l2, cell_annotation_blueprint_singler, cell_annotation_monaco_singler),	tolower	)) |>
+    mutate(across(c(cell_annotation_azimuth_l2, cell_annotation_blueprint_singler, cell_annotation_monaco_singler),	clean_cell_types	)) |>
+
+    is_strong_evidence(cell_annotation_azimuth_l2, cell_annotation_blueprint_singler) |> 
+
+    # Clen cell types
+    mutate(cell_type_clean = cell_type |> clean_cell_types()) |> 
+    left_join(read_csv("~/PostDoc/CuratedAtlasQueryR/dev/metadata_cell_type.csv"),  by = "cell_type") |> 
+    clean_cell_types_deeper() |>
+
+    # Reference annotation link
+    left_join(reference_annotation ) 
+
+  annotation_connie_non_immune = 
+    annotation |> 
+    filter(cell_type_harmonised |> is.na()) |> 
+
+    harmonise_names_non_immune() |> 
+
+    # Fix some gaps in the original code
+    mutate(cell_type_harmonised = case_when(
+      cell_type |> tolower() |> str_detect("endothelial") ~ "endothelial_cell",
+      cell_type |> tolower() |> str_detect("enodothelial") ~ "endothelial_cell",
+      cell_type |> tolower() |> str_detect("epithelial") ~ "epithelial_cell",
+      cell_type |> tolower() |> str_detect("fibroblast") ~ "fibroblast",
+      TRUE ~ cell_type
+    )) |>
+
+    mutate(confidence_class = 1)
+
+  single_cell_data |> 
+    left_join(
+      annotation |> 
+        filter(!cell_type_harmonised |> is.na()) |> 
+        bind_rows(annotation_connie_non_immune) |>
+        select(.cell, .sample, cell_type_harmonised, confidence_class),
+      by = join_by(.cell, !!.sample_column == .sample)
+    )
+
+}
 
diff --git a/R/targets_functions.R b/R/targets_functions.R
@@ -35,7 +35,8 @@ map2_test_differential_abundance_hpc = function(
     computing_resources = crew_controller_local(workers = 1) , 
     cpus_per_task = 1,
     debug_job_id = NULL,
-    append = FALSE
+    append = FALSE,
+    ...
   ){
 
   .abundance = enquo(.abundance)
@@ -259,7 +260,8 @@ test_differential_abundance_hpc = function(
     store =  tempfile(tmpdir = "."),  
     computing_resources = crew_controller_local(workers = 1) , 
     debug_job_id = NULL, 
-    append = FALSE
+    append = FALSE,
+    ...
   ){
 
   # Create input dataframe
@@ -275,7 +277,8 @@ test_differential_abundance_hpc = function(
       store = store, 
       computing_resources = computing_resources,
       debug_job_id = debug_job_id, 
-      append = append
+      append = append,
+      ...
     )) |> 
     pull(data) |> 
     extract2(1)