When dicovering genes and their function on deseases, survival curve is a good aspect to help us checking the influence of a factor (like genotype, RNA expression level, and age, gender). According to this pubilication here, "the survival curve can be created assuming various situations. It involves computing of probabilities of occurrence of event at a certain point of time and multiplying these successive probabilities by any earlier computed probabilities to get the final estimate."

This note is about how I can make survival curve on a certain gene, and will keep it in update when I get new ideas later :)

Data source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE53624

Public microarray RNA expression data of ESCC were retrieved from GEO database (GSE53624).

# https://cran.r-project.org/web/packages/vroom/readme/README.html

library(AnnoProbe)
library(GEOquery)
Loading required package: Biobase

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


Setting options('download.file.method.GEOquery'='auto')

Setting options('GEOquery.inmemory.gpl'=FALSE)

setwd("~/Documents/notes/")
dir.create("GSE53624",recursive = T)
eset <- getGEO("GSE53624", destdir = "./GSE53624", getGPL = F)
Found 1 file(s)

GSE53624_series_matrix.txt.gz

expr <- as.data.frame(exprs(eset[[1]]))
head(expr[,1:4])
A data.frame: 6 × 4
GSM1297076GSM1297077GSM1297078GSM1297079
<dbl><dbl><dbl><dbl>
114.19654114.15171413.79694813.802610
2 3.195847 3.042514 3.211573 2.995495
2415.26163715.83073915.31161015.527160
25 3.157660 3.378073 3.165554 3.634285
26 5.277165 5.297271 5.193853 5.467351
27 8.545228 8.327291 8.527834 8.590668

Query probe annotation:

ids=idmap('GPL18109','pipe')
head(ids)
file downloaded in /home/xiaofan/Documents/notes

A data.frame: 6 × 2
probe_idsymbol
<int><chr>
180108DDX11L1
280108WASH7P
3 4320DDX11L1
4 4320WASH7P
597414DDX11L1
697414WASH7P

Find the target gene:

ids[ids$symbol=='CST1',]
A data.frame: 1 × 2
probe_idsymbol
<int><chr>
8029869686CST1
CST1 = expr[as.character(ids[ids$symbol=='CST1',1]),]
CST1
A data.frame: 1 × 238
GSM1297076GSM1297077GSM1297078GSM1297079GSM1297080GSM1297081GSM1297082GSM1297083GSM1297084GSM1297085GSM1297304GSM1297305GSM1297306GSM1297307GSM1297308GSM1297309GSM1297310GSM1297311GSM1297312GSM1297313
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
6968612.098178.92284114.9066310.1860314.9162310.1804113.398669.19937815.4972511.3092614.658629.3313.422819.26877513.3288310.6113911.574938.85152813.125210.69054
CTSB = expr[as.character(ids[ids$symbol=='CTSB',1]),]
CTSB
A data.frame: 1 × 238
GSM1297076GSM1297077GSM1297078GSM1297079GSM1297080GSM1297081GSM1297082GSM1297083GSM1297084GSM1297085GSM1297304GSM1297305GSM1297306GSM1297307GSM1297308GSM1297309GSM1297310GSM1297311GSM1297312GSM1297313
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
3125014.6058213.5912314.594514.1149815.007113.2810414.3299713.9069314.738114.5931914.9071214.3530214.578114.2068413.9245713.7571314.5471914.4542914.8522314.14271

Metadata: GSE53624_clinical_data_of_patients_orignial_set.xls was downloaded from here

clinical <- xlsx::read.xlsx("./GSE53624/GSE53624_clinical_data_of_patients_orignial_set.xlsx",sheetIndex = 1)
table(clinical$Death.at.FU)
 no yes 
 46  73 
clinical$Death.at.FU <- gsub("no","0",
                             gsub("yes","1",clinical$Death.at.FU)) 
clinical_data <- data.frame(OS.time=as.numeric(clinical$Survival.time.months.),
                            OS=as.numeric(clinical$Death.at.FU),
                            sample=clinical$Patient.ID)
head(clinical_data)
A data.frame: 6 × 3
OS.timeOSsample
<dbl><dbl><chr>
148.7666671ec4
2 9.7666671ec6
3 5.8333331ec7
472.5333330ec9
572.6333330ec10
635.0333331ec11
phenotype <- pData(eset[[1]])
phe1 <- data.frame(sample = rownames(phenotype),
                   title = phenotype$title) 
phe1$tissue <- stringr::str_split(phe1$title," ",simplify = T)[,1]
phe1$patient <- stringr::str_split(phe1$title," ",simplify = T)[,5]
head(phe1)
phe1=phe1[phe1$tissue == 'cancer',]
phe1$patient=paste0('ec',phe1$patient)
identical(phe1$patient,clinical_data$sample)
A data.frame: 6 × 4
sampletitletissuepatient
<chr><chr><chr><chr>
1GSM1297076cancer tissue from patient 224cancer224
2GSM1297077normal tissue from patient 224normal224
3GSM1297078cancer tissue from patient 225cancer225
4GSM1297079normal tissue from patient 225normal225
5GSM1297080cancer tissue from patient 226cancer226
6GSM1297081normal tissue from patient 226normal226
FALSE
library(survival)
CST1.clinical_data=clinical_data[match(phe1$patient,clinical_data$sample),]
CST1.cl=CST1[match(phe1$sample,colnames(expr))]
CST1.cl
A data.frame: 1 × 119
GSM1297076GSM1297078GSM1297080GSM1297082GSM1297084GSM1297086GSM1297088GSM1297090GSM1297092GSM1297094GSM1297294GSM1297296GSM1297298GSM1297300GSM1297302GSM1297304GSM1297306GSM1297308GSM1297310GSM1297312
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
6968612.0981714.9066314.9162313.3986615.4972515.3929814.3915115.1147415.2902315.1833812.5049914.3944110.2938613.1536312.3921914.6586213.4228113.3288311.5749313.1252
CST1.cl=as.numeric(CST1.cl)

CST1.clinical_data$gene = ifelse( CST1.cl  > median( CST1.cl ),'high','low')
head(CST1.clinical_data)
A data.frame: 6 × 4
OS.timeOSsamplegene
<dbl><dbl><chr><chr>
6560.300000ec224low
6627.566671ec225high
6734.666671ec226high
6860.966670ec227low
8115.433331ec251high
8261.333330ec253high
sfit1=survfit(Surv(OS.time, OS)~gene, data=CST1.clinical_data)
p1 = survminer::ggsurvplot(sfit1,pval =TRUE, data = CST1.clinical_data, risk.table = TRUE) + ggplot2::labs(title="CST1")
p1
CTSB.clinical_data=clinical_data[match(phe1$patient,clinical_data$sample),]
CTSB.cl=CTSB[match(phe1$sample,colnames(expr))]
CTSB.cl=as.numeric(CTSB.cl)
CTSB.clinical_data$gene = ifelse( CTSB.cl  > median( CTSB.cl ),'high','low')
sfit2=survfit(Surv(OS.time, OS)~gene, data=CTSB.clinical_data)
p2 = survminer::ggsurvplot(sfit2,pval =TRUE, data = CTSB.clinical_data, risk.table = TRUE) + ggplot2::labs(title="CTSB")
p2
gene = "TTF1"
ids[ids$symbol=='TTF1',]
A data.frame: 3 × 2
probe_idsymbol
<int><chr>
44153 49101TTF1
44154119532TTF1
44155103167TTF1
TTF1 = expr[as.character(ids[ids$symbol==gene,1]),]
TTF1
A data.frame: 3 × 238
GSM1297076GSM1297077GSM1297078GSM1297079GSM1297080GSM1297081GSM1297082GSM1297083GSM1297084GSM1297085GSM1297304GSM1297305GSM1297306GSM1297307GSM1297308GSM1297309GSM1297310GSM1297311GSM1297312GSM1297313
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
4910111.0298710.8037410.9158310.6268511.4495710.9120510.8373810.7701610.1981410.1831710.3009710.1599711.3799010.8349811.0314810.6535210.3979110.3928310.33114 9.444365
11953211.7170411.4716611.6142011.2865812.1365011.5925211.5678411.4788611.2651711.2961311.0710610.8578111.8930611.2959811.6049011.2566311.0052610.9778810.9933010.568630
10316710.9323310.7775310.8687410.8431611.3939710.9483210.8388410.8974911.4211911.3420610.1047910.0941211.4242010.7068810.8101110.5575210.3464310.3103610.38547 9.385584
TTF1 = expr[49101,]
TTF1
PTEN=TTF1
A data.frame: 1 × 238
GSM1297076GSM1297077GSM1297078GSM1297079GSM1297080GSM1297081GSM1297082GSM1297083GSM1297084GSM1297085GSM1297304GSM1297305GSM1297306GSM1297307GSM1297308GSM1297309GSM1297310GSM1297311GSM1297312GSM1297313
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
790666.3910437.7984945.0720137.1969616.7980049.2599915.7994676.8667733.9124957.1937347.8674089.9854944.701946.2164634.8158194.5578915.6324516.5491774.0054415.385023
TTF1.clinical_data=clinical_data[match(phe1$patient,clinical_data$sample),]
TTF1.cl=as.numeric(TTF1[match(phe1$sample,colnames(expr))])
TTF1.clinical_data$gene = ifelse( TTF1.cl  > median( TTF1.cl ),'high','low')
sfit3=survfit(Surv(OS.time, OS)~gene, data=TTF1.clinical_data)
survminer::ggsurvplot(sfit3,pval =TRUE, data = TTF1.clinical_data, risk.table = TRUE) + ggplot2::labs(title="TTF1")
sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 10 (buster)

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.8.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.8.0

locale:
 [1] LC_CTYPE=en_US.UTF-8          LC_NUMERIC=C                 
 [3] LC_TIME=en_US.UTF-8           LC_COLLATE=en_US.UTF-8       
 [5] LC_MONETARY=en_US.UTF-8       LC_MESSAGES=en_US.UTF-8      
 [7] LC_PAPER=en_US.UTF-8          LC_NAME=en_US.UTF-8          
 [9] LC_ADDRESS=en_US.UTF-8        LC_TELEPHONE=en_US.UTF-8     
[11] LC_MEASUREMENT=en_US.UTF-8    LC_IDENTIFICATION=en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] survival_3.3-1      GEOquery_2.62.2     Biobase_2.54.0     
[4] BiocGenerics_0.40.0 AnnoProbe_0.1.6    

loaded via a namespace (and not attached):
 [1] ggtext_0.1.1       RColorBrewer_1.1-3 repr_1.1.4         tools_4.1.2       
 [5] backports_1.4.1    utf8_1.2.2         R6_2.5.1           DT_0.23           
 [9] DBI_1.1.2          colorspace_2.0-3   tidyselect_1.1.2   gridExtra_2.3     
[13] curl_4.3.2         compiler_4.1.2     cli_3.3.0          Cairo_1.5-15      
[17] xml2_1.3.3         labeling_0.4.2     scales_1.2.0       survMisc_0.5.6    
[21] readr_2.1.2        pbdZMQ_0.3-7       stringr_1.4.0      digest_0.6.29     
[25] R.utils_2.11.0     base64enc_0.1-3    pkgconfig_2.0.3    htmltools_0.5.2   
[29] fastmap_1.1.0      limma_3.50.3       htmlwidgets_1.5.4  rlang_1.0.2       
[33] generics_0.1.2     farver_2.1.0       zoo_1.8-10         jsonlite_1.8.0    
[37] dplyr_1.0.9        xlsx_0.6.5         car_3.0-13         R.oo_1.24.0       
[41] magrittr_2.0.3     Matrix_1.4-1       Rcpp_1.0.8.3       IRkernel_1.3      
[45] munsell_0.5.0      fansi_1.0.3        abind_1.4-5        lifecycle_1.0.1   
[49] R.methodsS3_1.8.1  stringi_1.7.6      carData_3.0-5      grid_4.1.2        
[53] crayon_1.5.1       survminer_0.4.9    lattice_0.20-45    IRdisplay_1.1     
[57] splines_4.1.2      gridtext_0.1.4     xlsxjars_0.6.1     hms_1.1.1         
[61] knitr_1.39         pillar_1.7.0       ggpubr_0.4.0       uuid_1.1-0        
[65] markdown_1.1       ggsignif_0.6.3     glue_1.6.2         evaluate_0.15     
[69] data.table_1.14.2  vctrs_0.4.1        tzdb_0.3.0         gtable_0.3.0      
[73] purrr_0.3.4        tidyr_1.2.0        km.ci_0.5-6        assertthat_0.2.1  
[77] ggplot2_3.3.6      xfun_0.31          xtable_1.8-4       broom_0.8.0       
[81] rstatix_0.7.0      tibble_3.1.7       pheatmap_1.0.12    rJava_1.0-6       
[85] KMsurv_0.1-5       ellipsis_0.3.2