TCGAbiolinks (三)获取全面的临床数据 

基础数据获取

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
library(TCGAbiolinks)
# Query GDC for the TCGA-PRAD clinical XML files and download them locally.
PRAD <- GDCquery(
  project = "TCGA-PRAD",
  data.category = "Clinical",
  file.type = "xml"
)
GDCdownload(query = PRAD)

# Open the help page listing the clinical tables GDCprepare_clinic can parse.
?GDCprepare_clinic

# Clinical tables to extract; 'drug', 'follow_up' and 'radiation' are
# deliberately not fetched.
clinical.info <- c("admin", "patient", "stage_event", "new_tumor_event")
clinical.info
# Drop every column of `df` whose name matches `regex`.
#
# df:    a data frame (or matrix) with column names.
# regex: regular expression passed to grepl() and tested against colnames(df).
#
# Returns `df` without the matching columns. `drop = FALSE` keeps the result
# two-dimensional even when only a single column is left; without it the
# result would silently collapse to a bare vector.
f_rm_colN <- function(df, regex) {
  keep <- !grepl(regex, colnames(df))
  df[, keep, drop = FALSE]
}
# Return the positions of the elements of `NameL` selected by how often their
# value occurs in `NameL`.
#
# NameL:   vector of names/labels.
# reverse: FALSE (default) selects elements whose value occurs exactly once;
#          TRUE selects elements whose value occurs more than once.
#
# Returns an integer vector of indices into `NameL` (nothing is removed here;
# the caller subsets with the result). NA entries are never selected because
# table() does not count them.
f_rm_duplicated <- function(NameL, reverse = FALSE) {
  counts <- table(NameL)
  if (reverse) {
    wanted <- as.vector(counts > 1)
  } else {
    wanted <- as.vector(counts == 1)
  }
  which(NameL %in% names(counts)[wanted])
}
# Parse each requested clinical table from the downloaded query and drop its
# "project" columns; the result is a list named after the entries of
# clinical.info.
clinical <- lapply(
  setNames(clinical.info, clinical.info),
  function(tbl) {
    f_rm_colN(GDCprepare_clinic(PRAD, clinical.info = tbl), "project")
  }
)

# Remove the "file_uuid" columns from the admin table before de-duplicating.
clinical$admin <- f_rm_colN(clinical$admin, "file_uuid")

# Keep only the distinct rows of every table.
clinical <- lapply(clinical, unique)
# Merge a list of data frames into one by folding merge() over the list from
# left to right.
#
# lc_mergedList: list of data frames to combine.
# by:            key column name(s), forwarded to merge(by = ).
# all:           forwarded to merge(all = ); TRUE (default) keeps unmatched
#                rows from both sides, FALSE keeps only matching rows.
#
# Returns the merged data frame.
f_merge <- function(lc_mergedList, by, all = TRUE) {
  merge_pair <- function(left, right) {
    merge(left, right, by = by, all = all)
  }
  Reduce(merge_pair, lc_mergedList)
}
# Collapse the list of per-table data frames into one data frame keyed on the
# patient barcode, keeping rows that are missing from some of the tables.
# TRUE is spelled out because the alias T can be reassigned.
clinical <- f_merge(clinical, by = "bcr_patient_barcode", all = TRUE)
clinical

数据更新补丁

1
2
3
# Fetch the clinical table via GDCquery_clinic and join it onto the table
# built so far. Columns present in both inputs are disambiguated with the
# ".old" (existing table) and ".new" (cl_new) suffixes.
# TRUE is spelled out because the alias T can be reassigned.
cl_new <- GDCquery_clinic(project = "TCGA-PRAD", type = "clinical")
clinical <- merge(
  clinical,
  cl_new,
  by = "bcr_patient_barcode",
  all = TRUE,
  suffixes = c(".old", ".new")
)
clinical

生存分析补丁

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# ---- Overall survival (OS) ----
# os_status: 0 = died of the disease, 1 = otherwise. Note this is the opposite
# direction from dcf_status below, where 1 marks the event.
clinical[["os_status"]] <- with(
  clinical,
  ifelse(vital_status.new == "Dead", 0, 1)
)
# os_time: days to death for deaths, otherwise days to last follow-up. It is
# computed before the death-reason correction below, so patients re-labelled
# there keep their time of death.
clinical[["os_time"]] <- with(
  clinical,
  ifelse(os_status == 0, days_to_death.new, days_to_last_follow_up)
)
sum(clinical$os_time)
# Deaths attributed to non-malignant disease are not counted as disease
# deaths. %in% is used instead of == because == yields NA for missing values,
# and data-frame subassignment rejects NA in a logical row index.
other_cause <- clinical$patient_death_reason %in% "Other, non-malignant disease"
clinical[other_cause, "os_status"] <- 1
sum(clinical$os_status)

# ---- dcf_status: 1 = a new tumor event or biochemical recurrence ----
clinical[["dcf_status"]] <- with(
  clinical,
  ifelse(new_neoplasm_event_type == "", 0, 1)
)
# A missing event type counts as "no event".
clinical[["dcf_status"]] <- with(
  clinical,
  ifelse(is.na(dcf_status), 0, dcf_status)
)
# NA-safe mask: a missing recurrence flag is treated as "not YES".
has_bcr <- clinical$biochemical_recurrence %in% "YES"
clinical[has_bcr, "dcf_status"] <- 1
sum(clinical$dcf_status)

# ---- dcf_time ----
# Fallback time: days to first biochemical recurrence, or os_time when that
# is missing.
clinical[["tmp_dcf_time"]] <- clinical[["days_to_first_biochemical_recurrence"]]
clinical[["tmp_dcf_time"]] <- with(
  clinical,
  ifelse(is.na(tmp_dcf_time), os_time, tmp_dcf_time)
)
sum(clinical$tmp_dcf_time)
# Patients with an event use the days to the new tumor event; everyone else,
# and events without a recorded time, use the fallback.
clinical[["dcf_time"]] <- with(
  clinical,
  ifelse(
    dcf_status == 1,
    days_to_new_tumor_event_after_initial_treatment,
    tmp_dcf_time
  )
)
clinical[["dcf_time"]] <- with(
  clinical,
  ifelse(is.na(dcf_time), tmp_dcf_time, dcf_time)
)
sum(clinical$dcf_time)
# Biochemical recurrence takes precedence. Using the NA-safe mask keeps the
# dcf_time computed above when the recurrence flag is missing, instead of
# overwriting it with NA.
clinical[["dcf_time"]] <- ifelse(
  has_bcr,
  clinical[["tmp_dcf_time"]],
  clinical[["dcf_time"]]
)
sum(clinical$dcf_time)
# Drop the scratch column and save the table.
clinical <- f_rm_colN(clinical, "tmp_dcf_time")
write.csv(clinical, file = "TCGA-PRAD_clinical.csv")

# Final sanity totals; a result of NA means the column still has missing
# values.
sum(clinical$os_status)
sum(clinical$os_time)
sum(clinical$dcf_status)
sum(clinical$dcf_time)

TCGAbiolinks (三)获取全面的临床数据 
https://b.limour.top/1904.html
Author
Limour
Posted on
July 13, 2022
Licensed under