cBioPortal的原始数据获取

发布于 2022-07-07  147 次阅读


来源

https://github.com/cBioPortal/datahub/tree/master/public

https://www.cbioportal.org/study/summary?id=prad_su2c_2019

第一步 获取数据

  • ~/dev/xray/xray -c ~/etc/xui2.json &
  • wget -e "https_proxy=http://127.0.0.1:20809" https://github.com/cBioPortal/datahub/raw/master/public/prad_su2c_2019/data_mrna_seq_fpkm_capture.txt -O data_mrna_seq_fpkm_capture.txt
  • wget -e "https_proxy=http://127.0.0.1:20809" https://github.com/cBioPortal/datahub/raw/master/public/prad_su2c_2019/data_clinical_sample.txt -O data_clinical_sample.txt
  • wget -e "https_proxy=http://127.0.0.1:20809" https://github.com/cBioPortal/datahub/raw/master/public/prad_su2c_2019/data_clinical_patient.txt -O data_clinical_patient.txt
# Expression table: tab-separated with a header row. Quoting is disabled
# (quote = '') so quote characters inside fields are kept as literal text, and
# allowEscapes = TRUE makes C-style escapes such as '\n' in fields be interpreted.
# NOTE(review): read.table's default check.names = TRUE rewrites column names
# that are not syntactically valid R names (e.g. containing '-'), so the
# column headers may no longer match the SAMPLE_ID values exactly - confirm.
d <- read.table(
  'data_mrna_seq_fpkm_capture.txt',
  header = TRUE,
  sep = '\t',
  allowEscapes = TRUE,
  quote = ''
)
d

# Sample-level clinical table. Everything from a '#' to the end of a line is
# ignored, so lines that start with '#' are skipped before the header is read
# (presumably the metadata lines at the top of the file - confirm).
meta <- read.table(
  'data_clinical_sample.txt',
  header = TRUE,
  sep = '\t',
  comment.char = '#'
)
meta

# Patient-level clinical table, read with the same settings as the sample table.
clinical <- read.table(
  'data_clinical_patient.txt',
  header = TRUE,
  sep = '\t',
  comment.char = '#'
)
clinical

第二步 获取生存分析的数据

#' Positions of values that occur exactly once (or more than once).
#'
#' @param NameL A vector of names / IDs (character, factor, ...).
#' @param reverse If `FALSE` (the default), report the positions of values
#'   that occur exactly once in `NameL`. If `TRUE`, report the positions of
#'   every value that occurs more than once (all copies, including the first).
#' @return An integer vector of positions into `NameL`, in ascending order;
#'   `integer(0)` when nothing qualifies or `NameL` is empty. `NA` entries are
#'   never reported, in either mode.
f_rm_duplicated <- function(NameL, reverse = FALSE) {
  # A value is repeated when an equal value exists earlier or later in NameL.
  repeated <- duplicated(NameL) | duplicated(NameL, fromLast = TRUE)
  wanted <- if (reverse) repeated else !repeated
  # Missing names are neither "unique" nor "duplicated": skip them always.
  which(wanted & !is.na(NameL))
}
#' Collapse rows that share a name, keeping the row with the largest mean.
#'
#' Rows whose name occurs once are kept as they are. For every name that
#' occurs more than once, only the row with the highest row mean is kept
#' (the first such row on ties).
#'
#' @param lc_exp A table of values. With `rowN != 0` it is indexed as a data
#'   frame whose column `rowN` holds the row names; all remaining columns must
#'   be numeric because `rowMeans()` is applied to them.
#' @param rowN Position of the column holding the names (default 1). Use `0`
#'   to take the names from `rownames(lc_exp)` and keep every column as data.
#' @return The de-duplicated table with the names as row names: first the
#'   rows with unique names in their original order, then one row per
#'   duplicated name, ordered by the sorted names. Rows whose name is `NA`
#'   are not returned, because `f_rm_duplicated()` never reports them.
f_name_dedup <- function(lc_exp, rowN = 1) {
  if (rowN == 0) {
    res <- lc_exp
    row_names <- rownames(lc_exp)
  } else {
    res <- lc_exp[-rowN]
    row_names <- lc_exp[[rowN]]
  }
  # Work on plain strings: a factor would carry its unused levels into the
  # grouping below and create empty groups.
  row_names <- as.character(row_names)

  uniq_idx <- f_rm_duplicated(row_names)
  dup_idx <- f_rm_duplicated(row_names, reverse = TRUE)

  # drop = FALSE keeps a one-column table two-dimensional, so that row names
  # can still be assigned.
  uniq_rows <- res[uniq_idx, , drop = FALSE]
  rownames(uniq_rows) <- row_names[uniq_idx]

  # Nothing to collapse: skip the grouping step entirely.
  if (length(dup_idx) == 0L) {
    return(uniq_rows)
  }

  dup_rows <- res[dup_idx, , drop = FALSE]
  dup_names <- row_names[dup_idx]
  rownames(dup_rows) <- NULL
  dup_means <- rowMeans(dup_rows)

  # For one group of row positions, return the position with the largest
  # mean. which.max() ignores NA and returns nothing when every mean in the
  # group is NA; fall back to the group's first row in that case.
  pick_best <- function(pos) {
    best <- which.max(dup_means[pos])
    if (length(best) == 0L) pos[1L] else pos[best]
  }
  groups <- split(seq_along(dup_names), dup_names)
  best_idx <- vapply(groups, pick_best, integer(1), USE.NAMES = FALSE)

  best_rows <- dup_rows[best_idx, , drop = FALSE]
  rownames(best_rows) <- dup_names[best_idx]

  rbind(uniq_rows, best_rows)
}
# Keep only the sample rows whose PATIENT_ID occurs exactly once in the sample
# table; a patient with several samples is dropped entirely, because
# f_rm_duplicated() returns the positions of values that occur once.
meta <- meta[f_rm_duplicated(meta$PATIENT_ID),]
# Index the sample table by patient ID so it can be aligned with `clinical`.
rownames(meta)<- meta$PATIENT_ID
meta
# Index the patient table by patient ID as well.
# NOTE(review): this assignment fails if PATIENT_ID is duplicated or missing in
# `clinical`; presumably the patient file has one row per patient - confirm.
rownames(clinical) <- clinical$PATIENT_ID
clinical
# Patients present in both tables.
mergeID <- intersect(rownames(clinical), rownames(meta))
# Put patient-level and sample-level columns side by side, row-aligned on
# mergeID.
# NOTE(review): both tables have a PATIENT_ID column, so the combined table
# contains that column name twice. The name `df` also masks stats::df.
df <- cbind(clinical[mergeID,], meta[mergeID,])
# Re-key the combined table by sample ID.
rownames(df) <- df$SAMPLE_ID
df
# Persist the combined clinical table and the expression table. `d` is saved
# exactly as it was read; f_name_dedup() is not applied to it here.
saveRDS(df, 'meta.rds')
saveRDS(d, 'fpkm.rds')

一枚爱好探索的医学生