TCGAbiolinks (三)获取全面的临床数据 

发布于 2022-07-13  416 次阅读


基础数据获取

library(TCGAbiolinks)
PRAD <- GDCquery(project = 'TCGA-PRAD',
                  data.category = "Clinical",
                  file.type = "xml")
GDCdownload(query = PRAD)
? GDCprepare_clinic
clinical.info <- c('admin', 'patient', 'stage_event', 'new_tumor_event') # 不获取'drug', 'follow_up', 'radiation'
clinical.info
f_rm_colN <- function(df, regex){
    df[,!grepl(regex, colnames(df))]
}
f_rm_duplicated <- function(NameL, reverse=F){
    tmp <- data.frame(table(NameL))
    if(reverse){
        tmp <- tmp$NameL[tmp$Freq > 1]
    }else{
        tmp <- tmp$NameL[tmp$Freq == 1]
    }
    which(NameL %in% as.character(tmp))
}
clinical <- list()
for(info in clinical.info){
    clinical[[info]]  <- GDCprepare_clinic(PRAD, clinical.info = info)
    clinical[[info]] <- f_rm_colN(clinical[[info]], "project")
}
clinical$admin <- f_rm_colN(clinical$admin, "file_uuid")
for(info in clinical.info){
    clinical[[info]] <- unique(clinical[[info]])
}
f_merge  <- function(lc_mergedList, by, all=T){
    Reduce(function(...) merge(..., by=by, all=all), lc_mergedList)
}
clinical <- f_merge(clinical, by = 'bcr_patient_barcode', all = T)
clinical

数据更新补丁

cl_new <- GDCquery_clinic(project = 'TCGA-PRAD', type = 'clinical')
clinical <- merge(clinical, cl_new, by = 'bcr_patient_barcode', all = T, suffixes = c('.old', '.new'))
clinical

生存分析补丁

clinical[['os_status']]  <- with(clinical, ifelse(vital_status.new == 'Dead', 0, 1)) # 0表示因病死亡
clinical[['os_time']] <- with(clinical, ifelse(os_status == 0, days_to_death.new, days_to_last_follow_up))
sum(clinical$os_time)
clinical[clinical$patient_death_reason == 'Other, non-malignant disease', 'os_status'] = 1
sum(clinical$os_status)
 
clinical[['dcf_status']] <- with(clinical, ifelse(new_neoplasm_event_type == '', 0, 1)) # 1表示有新的肿瘤事件
clinical[['dcf_status']] <- with(clinical, ifelse(is.na(dcf_status), 0, dcf_status)) # 1表示有新的肿瘤事件
clinical[clinical$biochemical_recurrence == 'YES', 'dcf_status'] = 1
sum(clinical$dcf_status)
 
clinical[['tmp_dcf_time']] <- clinical[['days_to_first_biochemical_recurrence']]
clinical[['tmp_dcf_time']] <- with(clinical, ifelse(is.na(tmp_dcf_time), os_time, tmp_dcf_time))
sum(clinical$tmp_dcf_time)
clinical[['dcf_time']] <- with(clinical, ifelse(dcf_status == 1, days_to_new_tumor_event_after_initial_treatment, tmp_dcf_time))
clinical[['dcf_time']] <- with(clinical, ifelse(is.na(dcf_time), tmp_dcf_time, dcf_time))
sum(clinical$dcf_time)
clinical[['dcf_time']] <- with(clinical, ifelse(biochemical_recurrence == 'YES', tmp_dcf_time, dcf_time))
sum(clinical$dcf_time)
clinical <- f_rm_colN(clinical, "tmp_dcf_time")
write.csv(clinical, file = 'TCGA-PRAD_clinical.csv')
 
sum(clinical$os_status)
sum(clinical$os_time)
sum(clinical$dcf_status)
sum(clinical$dcf_time)

医学生