Sometimes when we see an "unexpectied" cluster or group, one of the possible causes could be doublets. Treating doublets as single cells in downstream analyses can bias a study’s conclusions. Here we can use scds, which implamented two new approaches to identify doublets(Co-expression based doublet scoring cxds and binary classification based doublet scoring bcds) to elaluate doublets scores.

For more guides, check scds's page.

set.seed(8887)
library(scds, verbose = FALSE)
library(scater, verbose = FALSE)
library(rsvd, verbose = FALSE)
library(Rtsne, verbose = FALSE)
library(cowplot, verbose = FALSE)
library(scater, verbose = FALSE)
library(patchwork, verbose = FALSE)
library(Seurat, verbose = FALSE)
library(SeuratData, verbose = FALSE)
library(dplyr)

We first load the data (download available here), it's a reference scRNA-seq dataset of ~14,000 adult mouse cortical cell taxonomy from the Allen Institute, generated with the SMART-Seq2 protocol.

allen_reference <- readRDS("~/Documents/blog_notes/allen_cortex.rds")
Convert to SingleCellExperimentobject
brain.ref.sce <- as.SingleCellExperiment(allen_reference)
dim(brain.ref.sce)
<ol class=list-inline>
  • 34617
  • 14249
  • </ol>
    table(brain.ref.sce$subclass)
    
         Astro         CR       Endo    L2/3 IT         L4      L5 IT      L5 PT 
           368          7         94        982       1401        880        544 
         L6 CT      L6 IT        L6b      Lamp5 Macrophage      Meis2         NP 
           960       1872        358       1122         51         45        362 
         Oligo       Peri      Pvalb   Serpinf1        SMC       Sncg        Sst 
            91         32       1337         27         55        125       1741 
           Vip       VLMC 
          1728         67 

    Visualize:

    logcounts(brain.ref.sce) = log1p(counts(brain.ref.sce))
    vrs = apply(logcounts(brain.ref.sce),1,var)
    pc = rpca(t(logcounts(brain.ref.sce)[order(vrs,decreasing=TRUE)[1:100],]))
    ts = Rtsne(pc$x[,1:10],verb=FALSE)
    
    options(repr.plot.width=12, repr.plot.height=8)
    
    reducedDim(brain.ref.sce,"tsne") = ts$Y; rm(ts,vrs,pc)
    
    plotReducedDim(brain.ref.sce,"tsne",col="subclass")
    

    Annotate doublet using co-expression based doublet scoring

    brain.ref.sce = cxds(brain.ref.sce,retRes = TRUE)
    brain.ref.sce = bcds(brain.ref.sce,retRes = TRUE,verb=TRUE)
    
    -> selecting genes
    
    
    -> simulating doublets
    
    
    -> training classifier
    
    
    
    [01:32:23] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
    
    -> done.
    
    
    
    
    brain.ref.sce = cxds_bcds_hybrid(brain.ref.sce)
    plotReducedDim(brain.ref.sce,"tsne",col="hybrid_score")
    
    options(repr.plot.width=20, repr.plot.height=8)
    boxplot(brain.ref.sce$hybrid_score ~ brain.ref.sce$subclass, main="hybrid")
    

    Export scores and load it back into thge original Seurat object:

    metadata = as.data.frame(colData(brain.ref.sce))
    hyb_doublet_score = select(metadata, hybrid_score)
    
    allen_reference <- AddMetaData(allen_reference, metadata = hyb_doublet_score, col.name = 'hybrid.score')
    
    allen_reference_3000 <- allen_reference[,sample(colnames(allen_reference), size =3000, replace=F)]
    
    allen_reference_3000 <- SCTransform(allen_reference_3000, ncells = 3000, verbose = FALSE) %>%
        RunPCA(verbose = FALSE)  %>%
        RunUMAP(dims = 1:30)
    
    DimPlot(allen_reference_3000, group.by = "subclass", label = TRUE)
    
    options(repr.plot.width=10, repr.plot.height=8)
    FeaturePlot(allen_reference_3000, features = "hybrid.score", label = TRUE, pt.size=4)
    

    3 min is a lie :cake:

    Session information

    sessionInfo()
    
    R version 4.1.2 (2021-11-01)
    Platform: x86_64-pc-linux-gnu (64-bit)
    Running under: Debian GNU/Linux 11 (bullseye)
    
    Matrix products: default
    BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
    LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
    
    locale:
     [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
     [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
     [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
     [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
     [9] LC_ADDRESS=C               LC_TELEPHONE=C            
    [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
    
    attached base packages:
    [1] stats4    stats     graphics  grDevices utils     datasets  methods  
    [8] base     
    
    other attached packages:
     [1] dplyr_1.0.8                 stxBrain.SeuratData_0.1.1  
     [3] SeuratData_0.2.1            SeuratObject_4.0.4         
     [5] Seurat_4.1.0                patchwork_1.1.1            
     [7] cowplot_1.1.1               Rtsne_0.15                 
     [9] rsvd_1.0.5                  scater_1.22.0              
    [11] ggplot2_3.3.5               scuttle_1.4.0              
    [13] SingleCellExperiment_1.16.0 SummarizedExperiment_1.24.0
    [15] Biobase_2.54.0              GenomicRanges_1.46.1       
    [17] GenomeInfoDb_1.30.1         IRanges_2.28.0             
    [19] S4Vectors_0.32.3            BiocGenerics_0.40.0        
    [21] MatrixGenerics_1.6.0        matrixStats_0.61.0         
    [23] scds_1.9.1                 
    
    loaded via a namespace (and not attached):
      [1] uuid_1.0-3                plyr_1.8.6               
      [3] igraph_1.2.11             repr_1.1.4               
      [5] lazyeval_0.2.2            splines_4.1.2            
      [7] BiocParallel_1.28.3       listenv_0.8.0            
      [9] scattermore_0.7           digest_0.6.29            
     [11] htmltools_0.5.2           viridis_0.6.2            
     [13] fansi_1.0.2               magrittr_2.0.2           
     [15] ScaledMatrix_1.2.0        tensor_1.5               
     [17] cluster_2.1.2             ROCR_1.0-11              
     [19] globals_0.14.0            spatstat.sparse_2.1-0    
     [21] colorspace_2.0-2          rappdirs_0.3.3           
     [23] ggrepel_0.9.1             crayon_1.4.2             
     [25] RCurl_1.98-1.6            jsonlite_1.7.3           
     [27] spatstat.data_2.1-2       survival_3.2-13          
     [29] zoo_1.8-9                 glue_1.6.1               
     [31] polyclip_1.10-0           gtable_0.3.0             
     [33] zlibbioc_1.40.0           XVector_0.34.0           
     [35] leiden_0.3.9              DelayedArray_0.20.0      
     [37] BiocSingular_1.10.0       future.apply_1.8.1       
     [39] abind_1.4-5               scales_1.1.1             
     [41] DBI_1.1.2                 miniUI_0.1.1.1           
     [43] Rcpp_1.0.8                viridisLite_0.4.0        
     [45] xtable_1.8-4              reticulate_1.24          
     [47] spatstat.core_2.3-2       htmlwidgets_1.5.4        
     [49] httr_1.4.2                RColorBrewer_1.1-2       
     [51] ellipsis_0.3.2            ica_1.0-2                
     [53] farver_2.1.0              pkgconfig_2.0.3          
     [55] uwot_0.1.11               deldir_1.0-6             
     [57] utf8_1.2.2                labeling_0.4.2           
     [59] tidyselect_1.1.1          rlang_1.0.1              
     [61] reshape2_1.4.4            later_1.3.0              
     [63] munsell_0.5.0             tools_4.1.2              
     [65] xgboost_1.5.0.2           cli_3.1.1                
     [67] generics_0.1.2            ggridges_0.5.3           
     [69] evaluate_0.14             stringr_1.4.0            
     [71] fastmap_1.1.0             goftest_1.2-3            
     [73] fitdistrplus_1.1-6        purrr_0.3.4              
     [75] RANN_2.6.1                nlme_3.1-155             
     [77] pbapply_1.5-0             future_1.23.0            
     [79] sparseMatrixStats_1.6.0   mime_0.12                
     [81] compiler_4.1.2            beeswarm_0.4.0           
     [83] plotly_4.10.0             png_0.1-7                
     [85] spatstat.utils_2.3-0      tibble_3.1.6             
     [87] stringi_1.7.6             RSpectra_0.16-0          
     [89] lattice_0.20-45           IRdisplay_1.1            
     [91] Matrix_1.4-0              vctrs_0.3.8              
     [93] pillar_1.7.0              lifecycle_1.0.1          
     [95] spatstat.geom_2.3-1       lmtest_0.9-39            
     [97] RcppAnnoy_0.0.19          BiocNeighbors_1.12.0     
     [99] data.table_1.14.2         bitops_1.0-7             
    [101] irlba_2.3.5               httpuv_1.6.5             
    [103] R6_2.5.1                  promises_1.2.0.1         
    [105] KernSmooth_2.23-20        gridExtra_2.3            
    [107] vipor_0.4.5               parallelly_1.30.0        
    [109] codetools_0.2-18          MASS_7.3-55              
    [111] assertthat_0.2.1          withr_2.4.3              
    [113] sctransform_0.3.3         GenomeInfoDbData_1.2.7   
    [115] mgcv_1.8-38               parallel_4.1.2           
    [117] rpart_4.1.16              grid_4.1.2               
    [119] beachmat_2.10.0           IRkernel_1.3.0.9000      
    [121] tidyr_1.2.0               DelayedMatrixStats_1.16.0
    [123] pbdZMQ_0.3-7              pROC_1.18.0              
    [125] shiny_1.7.1               base64enc_0.1-3          
    [127] ggbeeswarm_0.6.0