When discovering genes and their roles in diseases, a survival curve is a good way to check the influence of a factor (such as genotype, RNA expression level, age, or gender). According to the publication linked here, "the survival curve can be created assuming various situations. It involves computing of probabilities of occurrence of event at a certain point of time and multiplying these successive probabilities by any earlier computed probabilities to get the final estimate."

This note shows how I draw a survival curve for a given gene; I will keep updating it as I get new ideas :)

Data source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE53624

Public microarray RNA expression data of ESCC (esophageal squamous cell carcinoma) were retrieved from the GEO database (GSE53624).

# https://cran.r-project.org/web/packages/vroom/readme/README.html

library(AnnoProbe)
library(GEOquery)

Loading required package: Biobase

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


Setting options('download.file.method.GEOquery'='auto')

Setting options('GEOquery.inmemory.gpl'=FALSE)

# Work from the notes directory so that the relative "./GSE53624" paths used
# in this notebook resolve against it.
setwd("~/Documents/notes/")

# Local cache directory for the GEO download. showWarnings = FALSE keeps
# re-runs quiet when the directory already exists.
dir.create("GSE53624", recursive = TRUE, showWarnings = FALSE)
# Download (or reuse from destdir) the series matrix for GSE53624.
# getGPL = FALSE skips fetching the platform annotation with the series.
eset <- getGEO("GSE53624", destdir = "./GSE53624", getGPL = FALSE)

Found 1 file(s)

GSE53624_series_matrix.txt.gz

# Take the expression values of the first element returned by getGEO() and
# keep them as a data frame (rows = probes, columns = GSM samples).
expr <- as.data.frame(
  exprs(eset[[1]])
)
# Preview the first rows for the first four samples.
head(expr[, seq_len(4)])

Query probe annotation:

# Probe-to-gene-symbol table for platform GPL18109, requested from AnnoProbe
# with the "pipe" annotation type.
ids <- idmap("GPL18109", "pipe")
head(ids)

file downloaded in /home/xiaofan/Documents/notes

Find the target gene:

# Annotation row(s) for CST1: shows which probe(s) map to the gene.
ids[ids$symbol == "CST1", ]

# Expression of the CST1 probe(s) across all samples. The probe ids are
# integers in `ids`, so they are converted to character to index `expr` by
# row name rather than by row position.
CST1 <- expr[as.character(ids$probe_id[ids$symbol == "CST1"]), ]
CST1

# Same lookup for CTSB.
CTSB <- expr[as.character(ids$probe_id[ids$symbol == "CTSB"]), ]
CTSB

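Metadata: the clinical data file GSE53624_clinical_data_of_patients_orignial_set.xls was downloaded from here. Note that the code below reads a copy with the .xlsx extension from ./GSE53624/, so save the download as .xlsx there first (or adjust the path).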

# Load the first worksheet of the clinical annotation workbook.
clinical <- xlsx::read.xlsx(
  file = "./GSE53624/GSE53624_clinical_data_of_patients_orignial_set.xlsx",
  sheetIndex = 1
)

# Count the patients per follow-up death status.
table(clinical$Death.at.FU)

 no yes 
 46  73

# Recode the follow-up status as text digits: "yes" -> "1", then "no" -> "0".
clinical$Death.at.FU <- gsub("yes", "1", clinical$Death.at.FU)
clinical$Death.at.FU <- gsub("no", "0", clinical$Death.at.FU)

# Compact survival table: follow-up time in months, numeric event indicator
# (1 = death at follow-up, 0 = no death), and the patient id used for matching.
clinical_data <- data.frame(
  OS.time = as.numeric(clinical$Survival.time.months.),
  OS = as.numeric(clinical$Death.at.FU),
  sample = clinical$Patient.ID
)
head(clinical_data)

# Sample-level metadata shipped with the series matrix.
phenotype <- pData(eset[[1]])

# Titles look like "cancer tissue from patient 224": the 1st word is the
# tissue type and the 5th word is the patient number.
phe1 <- data.frame(
  sample = rownames(phenotype),
  title = phenotype$title,
  tissue = stringr::str_split(phenotype$title, " ", simplify = TRUE)[, 1],
  patient = stringr::str_split(phenotype$title, " ", simplify = TRUE)[, 5]
)
head(phe1)

# Keep tumour samples only and prefix the patient number with "ec" so it has
# the same form as the patient ids in clinical_data$sample.
phe1 <- phe1[phe1$tissue == "cancer", ]
phe1$patient <- paste0("ec", phe1$patient)
# Check whether both tables already list the patients in the same order.
identical(phe1$patient, clinical_data$sample)

library(survival)

# Reorder the clinical rows to follow the tumour samples in phe1, and pick the
# matching expression columns for the CST1 probe.
CST1.clinical_data <- clinical_data[
  match(phe1$patient, clinical_data$sample),
]
CST1.cl <- CST1[match(phe1$sample, colnames(expr))]
CST1.cl

# Flatten the one-row data frame into a plain numeric vector.
CST1.cl <- as.numeric(CST1.cl)

# Split patients at the median expression: above the median is "high",
# everything else is "low".
CST1.clinical_data$gene <- ifelse(CST1.cl > median(CST1.cl), "high", "low")
head(CST1.clinical_data)

# Survival curves per expression group, drawn with a p-value and a risk table.
sfit1 <- survfit(Surv(OS.time, OS) ~ gene, data = CST1.clinical_data)
p1 <- survminer::ggsurvplot(
  sfit1,
  data = CST1.clinical_data,
  pval = TRUE,
  risk.table = TRUE
) + ggplot2::labs(title = "CST1")
p1

# Same workflow for CTSB: align clinical rows and expression columns to the
# tumour samples in phe1.
CTSB.clinical_data <- clinical_data[
  match(phe1$patient, clinical_data$sample),
]
CTSB.cl <- CTSB[match(phe1$sample, colnames(expr))]
CTSB.cl <- as.numeric(CTSB.cl)
# Median split into "high" / "low" expression groups.
CTSB.clinical_data$gene <- ifelse(CTSB.cl > median(CTSB.cl), "high", "low")
# Survival curves per group, drawn with a p-value and a risk table.
sfit2 <- survfit(Surv(OS.time, OS) ~ gene, data = CTSB.clinical_data)
p2 <- survminer::ggsurvplot(
  sfit2,
  data = CTSB.clinical_data,
  pval = TRUE,
  risk.table = TRUE
) + ggplot2::labs(title = "CTSB")
p2

# Gene of interest for this section.
gene <- "TTF1"
# Annotation rows for the gene; several probes can map to the same symbol.
ids[ids$symbol == gene, ]

# Expression of every probe annotated to the gene.
TTF1 <- expr[as.character(ids[ids$symbol == gene, 1]), ]
TTF1

# Keep a single probe (id 49101) for the survival analysis. The id is given as
# a character string so the row is selected by NAME. A bare number
# (expr[49101, ]) selects the 49101st row by POSITION, which is a different
# probe because the row names of expr are not consecutive.
TTF1 <- expr["49101", ]
TTF1
# NOTE(review): PTEN is assigned the TTF1 probe values here, which looks like a
# leftover; kept so that any later code relying on `PTEN` still runs.
PTEN <- TTF1

# Align clinical rows and expression columns to the tumour samples in phe1;
# as.numeric() flattens the one-row data frame of the selected TTF1 probe.
TTF1.clinical_data <- clinical_data[
  match(phe1$patient, clinical_data$sample),
]
TTF1.cl <- as.numeric(
  TTF1[match(phe1$sample, colnames(expr))]
)
# Median split into "high" / "low" expression groups.
TTF1.clinical_data$gene <- ifelse(TTF1.cl > median(TTF1.cl), "high", "low")
# Survival curves per group, drawn with a p-value and a risk table.
sfit3 <- survfit(Surv(OS.time, OS) ~ gene, data = TTF1.clinical_data)
survminer::ggsurvplot(
  sfit3,
  data = TTF1.clinical_data,
  pval = TRUE,
  risk.table = TRUE
) + ggplot2::labs(title = "TTF1")

# Print the R version, platform, and attached/loaded package versions of the
# session that produced this notebook.
sessionInfo()

R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 10 (buster)

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.8.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.8.0

locale:
 [1] LC_CTYPE=en_US.UTF-8          LC_NUMERIC=C                 
 [3] LC_TIME=en_US.UTF-8           LC_COLLATE=en_US.UTF-8       
 [5] LC_MONETARY=en_US.UTF-8       LC_MESSAGES=en_US.UTF-8      
 [7] LC_PAPER=en_US.UTF-8          LC_NAME=en_US.UTF-8          
 [9] LC_ADDRESS=en_US.UTF-8        LC_TELEPHONE=en_US.UTF-8     
[11] LC_MEASUREMENT=en_US.UTF-8    LC_IDENTIFICATION=en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] survival_3.3-1      GEOquery_2.62.2     Biobase_2.54.0     
[4] BiocGenerics_0.40.0 AnnoProbe_0.1.6    

loaded via a namespace (and not attached):
 [1] ggtext_0.1.1       RColorBrewer_1.1-3 repr_1.1.4         tools_4.1.2       
 [5] backports_1.4.1    utf8_1.2.2         R6_2.5.1           DT_0.23           
 [9] DBI_1.1.2          colorspace_2.0-3   tidyselect_1.1.2   gridExtra_2.3     
[13] curl_4.3.2         compiler_4.1.2     cli_3.3.0          Cairo_1.5-15      
[17] xml2_1.3.3         labeling_0.4.2     scales_1.2.0       survMisc_0.5.6    
[21] readr_2.1.2        pbdZMQ_0.3-7       stringr_1.4.0      digest_0.6.29     
[25] R.utils_2.11.0     base64enc_0.1-3    pkgconfig_2.0.3    htmltools_0.5.2   
[29] fastmap_1.1.0      limma_3.50.3       htmlwidgets_1.5.4  rlang_1.0.2       
[33] generics_0.1.2     farver_2.1.0       zoo_1.8-10         jsonlite_1.8.0    
[37] dplyr_1.0.9        xlsx_0.6.5         car_3.0-13         R.oo_1.24.0       
[41] magrittr_2.0.3     Matrix_1.4-1       Rcpp_1.0.8.3       IRkernel_1.3      
[45] munsell_0.5.0      fansi_1.0.3        abind_1.4-5        lifecycle_1.0.1   
[49] R.methodsS3_1.8.1  stringi_1.7.6      carData_3.0-5      grid_4.1.2        
[53] crayon_1.5.1       survminer_0.4.9    lattice_0.20-45    IRdisplay_1.1     
[57] splines_4.1.2      gridtext_0.1.4     xlsxjars_0.6.1     hms_1.1.1         
[61] knitr_1.39         pillar_1.7.0       ggpubr_0.4.0       uuid_1.1-0        
[65] markdown_1.1       ggsignif_0.6.3     glue_1.6.2         evaluate_0.15     
[69] data.table_1.14.2  vctrs_0.4.1        tzdb_0.3.0         gtable_0.3.0      
[73] purrr_0.3.4        tidyr_1.2.0        km.ci_0.5-6        assertthat_0.2.1  
[77] ggplot2_3.3.6      xfun_0.31          xtable_1.8-4       broom_0.8.0       
[81] rstatix_0.7.0      tibble_3.1.7       pheatmap_1.0.12    rJava_1.0-6       
[85] KMsurv_0.1-5       ellipsis_0.3.2

	GSM1297076	GSM1297077	GSM1297078	GSM1297079
	<dbl>	<dbl>	<dbl>	<dbl>
1	14.196541	14.151714	13.796948	13.802610
2	3.195847	3.042514	3.211573	2.995495
24	15.261637	15.830739	15.311610	15.527160
25	3.157660	3.378073	3.165554	3.634285
26	5.277165	5.297271	5.193853	5.467351
27	8.545228	8.327291	8.527834	8.590668

	probe_id	symbol
	<int>	<chr>
1	80108	DDX11L1
2	80108	WASH7P
3	4320	DDX11L1
4	4320	WASH7P
5	97414	DDX11L1
6	97414	WASH7P

	probe_id	symbol
	<int>	<chr>
80298	69686	CST1

	GSM1297076	GSM1297077	GSM1297078	GSM1297079	GSM1297080	GSM1297081	GSM1297082	GSM1297083	GSM1297084	GSM1297085	⋯	GSM1297304	GSM1297305	GSM1297306	GSM1297307	GSM1297308	GSM1297309	GSM1297310	GSM1297311	GSM1297312	GSM1297313
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
69686	12.09817	8.922841	14.90663	10.18603	14.91623	10.18041	13.39866	9.199378	15.49725	11.30926	⋯	14.65862	9.33	13.42281	9.268775	13.32883	10.61139	11.57493	8.851528	13.1252	10.69054

	GSM1297076	GSM1297077	GSM1297078	GSM1297079	GSM1297080	GSM1297081	GSM1297082	GSM1297083	GSM1297084	GSM1297085	⋯	GSM1297304	GSM1297305	GSM1297306	GSM1297307	GSM1297308	GSM1297309	GSM1297310	GSM1297311	GSM1297312	GSM1297313
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
31250	14.60582	13.59123	14.5945	14.11498	15.0071	13.28104	14.32997	13.90693	14.7381	14.59319	⋯	14.90712	14.35302	14.5781	14.20684	13.92457	13.75713	14.54719	14.45429	14.85223	14.14271

A data.frame: 6 × 3
	OS.time	OS	sample
	<dbl>	<dbl>	<chr>
1	48.766667	1	ec4
2	9.766667	1	ec6
3	5.833333	1	ec7
4	72.533333	0	ec9
5	72.633333	0	ec10
6	35.033333	1	ec11

A data.frame: 6 × 4
	sample	title	tissue	patient
	<chr>	<chr>	<chr>	<chr>
1	GSM1297076	cancer tissue from patient 224	cancer	224
2	GSM1297077	normal tissue from patient 224	normal	224
3	GSM1297078	cancer tissue from patient 225	cancer	225
4	GSM1297079	normal tissue from patient 225	normal	225
5	GSM1297080	cancer tissue from patient 226	cancer	226
6	GSM1297081	normal tissue from patient 226	normal	226

A data.frame: 6 × 4
	OS.time	OS	sample	gene
	<dbl>	<dbl>	<chr>	<chr>
65	60.30000	0	ec224	low
66	27.56667	1	ec225	high
67	34.66667	1	ec226	high
68	60.96667	0	ec227	low
81	15.43333	1	ec251	high
82	61.33333	0	ec253	high

A data.frame: 3 × 238
	GSM1297076	GSM1297077	GSM1297078	GSM1297079	GSM1297080	GSM1297081	GSM1297082	GSM1297083	GSM1297084	GSM1297085	⋯	GSM1297304	GSM1297305	GSM1297306	GSM1297307	GSM1297308	GSM1297309	GSM1297310	GSM1297311	GSM1297312	GSM1297313
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
49101	11.02987	10.80374	10.91583	10.62685	11.44957	10.91205	10.83738	10.77016	10.19814	10.18317	⋯	10.30097	10.15997	11.37990	10.83498	11.03148	10.65352	10.39791	10.39283	10.33114	9.444365
119532	11.71704	11.47166	11.61420	11.28658	12.13650	11.59252	11.56784	11.47886	11.26517	11.29613	⋯	11.07106	10.85781	11.89306	11.29598	11.60490	11.25663	11.00526	10.97788	10.99330	10.568630
103167	10.93233	10.77753	10.86874	10.84316	11.39397	10.94832	10.83884	10.89749	11.42119	11.34206	⋯	10.10479	10.09412	11.42420	10.70688	10.81011	10.55752	10.34643	10.31036	10.38547	9.385584