Kaplan-Meier curve of overall survival of cases stratified by the mRNA level of a certain gene in an independent cohort
Only takes 3 min!
When dicovering genes and their function on deseases, survival curve is a good aspect to help us checking the influence of a factor (like genotype, RNA expression level, and age, gender). According to this pubilication here, "the survival curve can be created assuming various situations. It involves computing of probabilities of occurrence of event at a certain point of time and multiplying these successive probabilities by any earlier computed probabilities to get the final estimate."
This note is about how I can make survival curve on a certain gene, and will keep it in update when I get new ideas later :)
Data source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE53624
Public microarray RNA expression data of ESCC were retrieved from GEO database (GSE53624).
# https://cran.r-project.org/web/packages/vroom/readme/README.html
library(AnnoProbe)
library(GEOquery)
setwd("~/Documents/notes/")
dir.create("GSE53624",recursive = T)
eset <- getGEO("GSE53624", destdir = "./GSE53624", getGPL = F)
expr <- as.data.frame(exprs(eset[[1]]))
head(expr[,1:4])
Query probe annotation:
ids=idmap('GPL18109','pipe')
head(ids)
Find the target gene:
ids[ids$symbol=='CST1',]
CST1 = expr[as.character(ids[ids$symbol=='CST1',1]),]
CST1
CTSB = expr[as.character(ids[ids$symbol=='CTSB',1]),]
CTSB
Metadata: GSE53624_clinical_data_of_patients_orignial_set.xls was downloaded from here
clinical <- xlsx::read.xlsx("./GSE53624/GSE53624_clinical_data_of_patients_orignial_set.xlsx",sheetIndex = 1)
table(clinical$Death.at.FU)
clinical$Death.at.FU <- gsub("no","0",
gsub("yes","1",clinical$Death.at.FU))
clinical_data <- data.frame(OS.time=as.numeric(clinical$Survival.time.months.),
OS=as.numeric(clinical$Death.at.FU),
sample=clinical$Patient.ID)
head(clinical_data)
phenotype <- pData(eset[[1]])
phe1 <- data.frame(sample = rownames(phenotype),
title = phenotype$title)
phe1$tissue <- stringr::str_split(phe1$title," ",simplify = T)[,1]
phe1$patient <- stringr::str_split(phe1$title," ",simplify = T)[,5]
head(phe1)
phe1=phe1[phe1$tissue == 'cancer',]
phe1$patient=paste0('ec',phe1$patient)
identical(phe1$patient,clinical_data$sample)
library(survival)
CST1.clinical_data=clinical_data[match(phe1$patient,clinical_data$sample),]
CST1.cl=CST1[match(phe1$sample,colnames(expr))]
CST1.cl
CST1.cl=as.numeric(CST1.cl)
CST1.clinical_data$gene = ifelse( CST1.cl > median( CST1.cl ),'high','low')
head(CST1.clinical_data)
sfit1=survfit(Surv(OS.time, OS)~gene, data=CST1.clinical_data)
p1 = survminer::ggsurvplot(sfit1,pval =TRUE, data = CST1.clinical_data, risk.table = TRUE) + ggplot2::labs(title="CST1")
p1
CTSB.clinical_data=clinical_data[match(phe1$patient,clinical_data$sample),]
CTSB.cl=CTSB[match(phe1$sample,colnames(expr))]
CTSB.cl=as.numeric(CTSB.cl)
CTSB.clinical_data$gene = ifelse( CTSB.cl > median( CTSB.cl ),'high','low')
sfit2=survfit(Surv(OS.time, OS)~gene, data=CTSB.clinical_data)
p2 = survminer::ggsurvplot(sfit2,pval =TRUE, data = CTSB.clinical_data, risk.table = TRUE) + ggplot2::labs(title="CTSB")
p2
gene = "TTF1"
ids[ids$symbol=='TTF1',]
TTF1 = expr[as.character(ids[ids$symbol==gene,1]),]
TTF1
TTF1 = expr[49101,]
TTF1
PTEN=TTF1
TTF1.clinical_data=clinical_data[match(phe1$patient,clinical_data$sample),]
TTF1.cl=as.numeric(TTF1[match(phe1$sample,colnames(expr))])
TTF1.clinical_data$gene = ifelse( TTF1.cl > median( TTF1.cl ),'high','low')
sfit3=survfit(Surv(OS.time, OS)~gene, data=TTF1.clinical_data)
survminer::ggsurvplot(sfit3,pval =TRUE, data = TTF1.clinical_data, risk.table = TRUE) + ggplot2::labs(title="TTF1")
sessionInfo()