# Work from the book's project folder so that relative paths used later
# (e.g. "data.RData") resolve there.
# NOTE(review): absolute, machine-specific Windows path; this call fails on
# any machine where the directory does not exist.
setwd(dir = "E:\\ebook(2007-8-30)\\Graphical Models(2016-01-11)\\Probabilistic Graphical Models Principles and Techniques\\")

工具的安装

首先执行下面的代码删除以前的安装:

# Remove any previous BiocInstaller installation. remove.packages() raises an
# error when the package is absent, so only call it when an installed copy is
# actually found (system.file() returns "" for a missing package).
if (nzchar(system.file(package = "BiocInstaller"))) {
  remove.packages("BiocInstaller")
}

下面的代码用于安装"BiocInstaller"这个包:

# Fetch and run the Bioconductor installer script; biocLite() is used right
# after this call.
# NOTE(review): this executes remote code at run time, and biocLite.R is the
# legacy installer (presumably superseded by BiocManager in newer Bioconductor
# releases) - confirm it is still appropriate for the R version in use.
source(file = "https://bioconductor.org/biocLite.R")

然后安装几种常用的从TCGA下载数据的工具:

# Install the TCGA download helpers; each package is requested exactly once.
biocLite(c("RTCGAToolbox", "RTCGA"))

# TCGAbiolinksGUI is installed with dependencies = TRUE so that its
# dependency packages are pulled in as well.
biocLite("TCGAbiolinksGUI", dependencies = TRUE)

‘RTCGAToolbox’

下面利用RTCGAToolbox这个包下载HNSC数据。

首先把这个包RTCGAToolbox加载上:

# Attach RTCGAToolbox so its Firehose helper functions can be called directly.
library(RTCGAToolbox)

先查看一下TCGA上的数据集:

# List the dataset codes that can be requested; the printed result below
# contains 38 codes, including "HNSC".
getFirehoseDatasets()
##  [1] "ACC"      "BLCA"     "BRCA"     "CESC"     "CHOL"     "COADREAD"
##  [7] "COAD"     "DLBC"     "ESCA"     "FPPP"     "GBMLGG"   "GBM"     
## [13] "HNSC"     "KICH"     "KIPAN"    "KIRC"     "KIRP"     "LAML"    
## [19] "LGG"      "LIHC"     "LUAD"     "LUSC"     "MESO"     "OV"      
## [25] "PAAD"     "PCPG"     "PRAD"     "READ"     "SARC"     "SKCM"    
## [31] "STAD"     "STES"     "TGCT"     "THCA"     "THYM"     "UCEC"    
## [37] "UCS"      "UVM"

显然我们关注的是HNSC这个。

利用下面代码看一下数据的最后三个更新日期:

# Show the three most recent dates reported by getFirehoseAnalyzeDates().
# NOTE(review): one of these values is later passed as `runDate`; confirm
# that analysis dates are the intended source for that argument.
getFirehoseAnalyzeDates(last = 3)
## [1] "20160128" "20150821" "20150402"

选择第三个日期(20150402),下载数据并保存到data.RData:

# Download the HNSC data for run date 20150402 with clinical data enabled and
# mutation data disabled, cache the object on disk, then drop it from the
# workspace.
hnscData <- getFirehoseData(
  dataset = "HNSC",
  runDate = "20150402",
  forceDownload = TRUE,
  Clinic = TRUE,
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  Mutation = FALSE
)
save(hnscData, file = "data.RData")
rm(hnscData)

载入数据

# Restore the cached object. save() wrote `hnscData` to "data.RData", so it
# has to be read back with load(); data() only looks up datasets shipped with
# packages and therefore cannot find it.
load("data.RData")
# Extract the clinical table from the FirehoseData object.
hnscd <- getData(hnscData, "Clinical")
## HNSC FirehoseData object
## Available data types:
## Clinical: A data frame, dim: 522 23
## To export data, you may use getData() function.

到此数据加载完成,显示一下前15行数据:

# Render the first 15 clinical records as a table (numeric columns shown with
# at most 3 digits).
library(knitr)
## Warning: package 'knitr' was built under R version 3.4.1
kable(hnscd[seq_len(15), ], digits = 3)
Composite Element REF years_to_birth vital_status days_to_death days_to_last_followup primary_site_of_disease neoplasm_diseasestage pathology_T_stage pathology_N_stage pathology_M_stage dcc_upload_date gender date_of_initial_pathologic_diagnosis days_to_last_known_alive radiation_therapy histological_type radiations_radiation_regimenindication number_pack_years_smoked year_of_tobacco_smoking_onset number_of_lymph_nodes race ethnicity batch_number
tcga.4p.aa8j value 66 0 NA 102 head and neck stage iva t2 n2c mx 19-3-2015 male 2013 NA yes head and neck squamous cell carcinoma yes NA NA 6 black or african american not hispanic or latino 403.15.0
tcga.ba.4074 value 69 1 462 462 head and neck stage iva t2 n2c m0 19-3-2015 male 2003 437 NA head and neck squamous cell carcinoma no 51 1951 5 white not hispanic or latino 54.67.0
tcga.ba.4075 value 49 1 283 283 head and neck stage iii t3 n0 m0 19-3-2015 male 2004 278 NA head and neck squamous cell carcinoma yes 30 1974 0 black or african american not hispanic or latino 54.67.0
tcga.ba.4076 value 39 1 415 415 head and neck NA tx nx NA 9-10-2014 male 2003 236 NA head and neck squamous cell carcinoma yes 30 1983 NA white not hispanic or latino 54.58.0
tcga.ba.4077 value 45 1 1134 NA head and neck stage iva t4a n0 m0 19-3-2015 female 2003 NA NA head and neck squamous cell carcinoma yes 30 1983 NA white not hispanic or latino 54.67.0
tcga.ba.4078 value 83 1 276 276 head and neck NA NA NA NA 19-3-2015 male 2003 170 NA head and neck squamous cell carcinoma no 75 1944 4 white not hispanic or latino 54.67.0
tcga.ba.5149 value 47 1 806 248 head and neck stage iva t3 n2c NA 19-3-2015 male 2010 248 NA head and neck squamous cell carcinoma no 60 1980 5 white not hispanic or latino 83.64.0
tcga.ba.5151 value 72 0 NA 722 head and neck stage iva t4a n0 NA 19-3-2015 male 2010 190 NA head and neck squamous cell carcinoma no 20 1962 0 white not hispanic or latino 54.67.0
tcga.ba.5152 value 56 0 NA 1288 head and neck stage iva t4a n0 m0 19-3-2015 male 2009 NA NA head and neck squamous cell carcinoma no NA NA 0 white not hispanic or latino 145.62.0
tcga.ba.5153 value 51 1 1762 1762 head and neck NA t2 nx NA 19-3-2015 male 2005 1616 NA head and neck squamous cell carcinoma no NA NA NA white not hispanic or latino 54.67.0
tcga.ba.5555 value 54 0 NA 520 head and neck stage iva t3 n2c NA 19-3-2015 male 2010 186 NA head and neck squamous cell carcinoma yes 62 NA 1 black or african american not hispanic or latino 83.64.0
tcga.ba.5556 value 58 0 NA 725 head and neck stage ii t2 n0 NA 19-3-2015 female 2010 179 NA head and neck squamous cell carcinoma yes 60 1968 0 white not hispanic or latino 83.64.0
tcga.ba.5557 value 41 0 NA 623 head and neck stage iii t1 n1 NA 19-3-2015 female 2010 242 NA head and neck squamous cell carcinoma no NA NA 1 NA hispanic or latino 83.64.0
tcga.ba.5558 value 65 0 NA 1636 head and neck NA tx nx NA 19-3-2015 male 2006 1636 NA head and neck squamous cell carcinoma yes NA NA 0 white not hispanic or latino 83.64.0
tcga.ba.5559 value 71 0 NA 1747 head and neck NA tx nx NA 19-3-2015 male 2006 1747 NA head and neck squamous cell carcinoma yes NA NA 4 white not hispanic or latino 83.64.0
# Print the years_to_birth column; the quoted values in the output below show
# that it is stored as character strings.
hnscd[["years_to_birth"]]
##   [1] "66" "69" "49" "39" "45" "83" "47" "72" "56" "51" "54" "58" "41" "65"
##  [15] "71" "53" "62" "60" "75" "47" "28" "61" "59" "77" "57" "46" "59" "41"
##  [29] "24" "44" "70" "80" "49" "62" "62" "59" "50" "68" "48" "52" "73" "66"
##  [43] "50" "56" "67" "43" "61" "40" "58" "64" "63" "69" "84" "47" "64" "65"
##  [57] "52" "61" "72" "45" "61" "67" "60" "68" "56" "56" "73" "62" "63" "61"
##  [71] "70" "52" "70" "19" "53" "71" "79" "75" "48" "64" "56" "60" "59" "68"
##  [85] "80" "48" "55" "38" "51" "60" "89" "78" "55" "56" "53" "57" "66" "56"
##  [99] "64" "55" "85" "61" "58" "63" "49" "73" "66" "47" "64" "61" "67" "78"
## [113] "58" "66" "53" "63" "61" "60" "60" "71" "67" "60" "50" "59" "48" "46"
## [127] "40" "47" "57" "57" "59" "61" "59" "40" "82" "59" "65" "67" "61" "46"
## [141] "69" "73" "87" "74" "87" "52" "50" "69" "79" "63" "69" "52" "65" "77"
## [155] "71" "61" "59" "83" "40" "75" "80" "77" "76" "51" "63" "88" "56" NA  
## [169] "59" "69" "76" "78" "58" "73" "51" "48" "53" "35" "71" "59" "38" "58"
## [183] "59" "68" "51" "56" "66" "53" "47" "62" "67" "50" "68" "60" "78" "69"
## [197] "66" "60" "52" "54" "59" "72" "45" "45" "66" "67" "83" "58" "78" "58"
## [211] "49" "79" "42" "69" "70" "55" "67" "36" "67" "26" "70" "80" "44" "53"
## [225] "60" "64" "68" "53" "61" "73" "68" "60" "57" "65" "62" "52" "58" "76"
## [239] "63" "64" "63" "59" "60" "62" "50" "66" "53" "26" "50" "57" "62" "60"
## [253] "53" "66" "67" "68" "71" "87" "60" "80" "51" "73" "74" "41" "79" "64"
## [267] "57" "65" "80" "59" "74" "67" "48" "49" "61" "65" "74" "39" "54" "87"
## [281] "53" "85" "66" "80" "76" "49" "61" "82" "64" "34" "53" "67" "77" "69"
## [295] "60" "50" "62" "55" "63" "64" "62" "58" "55" "32" "57" "64" "49" "67"
## [309] "43" "61" "64" "74" "78" "60" "29" "62" "76" "60" "65" "67" "77" "73"
## [323] "47" "55" "56" "79" "49" "64" "57" "77" "87" "38" "66" "48" "57" "82"
## [337] "69" "46" "64" "59" "87" "75" "47" "61" "75" "72" "65" "82" "48" "24"
## [351] "42" "82" "78" "85" "53" "69" "65" "61" "69" "68" "58" "65" "79" "82"
## [365] "77" "53" "47" "82" "69" "59" "50" "61" "73" "64" "55" "62" "52" "59"
## [379] "61" "67" "65" "44" "62" "57" "50" "67" "43" "52" "64" "73" "52" "66"
## [393] "70" "51" "62" "57" "58" "47" "53" "48" "47" "58" "48" "66" "72" "67"
## [407] "62" "54" "51" "47" "75" "70" "73" "75" "54" "50" "35" "65" "57" "60"
## [421] "62" "69" "74" "52" "62" "63" "58" "51" "61" "41" "74" "56" "79" "56"
## [435] "65" "49" "60" "68" "55" "57" "76" "63" "54" "70" "72" "43" "69" "53"
## [449] "55" "41" "52" "30" "85" "55" "60" "53" "74" "53" "68" "51" "54" "49"
## [463] "41" "49" "49" "50" "67" "47" "45" "49" "54" "79" "53" "55" "54" "79"
## [477] "60" "61" "69" "65" "52" "71" "56" "66" "62" "59" "60" "56" "60" "68"
## [491] "69" "60" "82" "58" "73" "53" "49" "82" "56" "51" "59" "62" "54" "67"
## [505] "50" "54" "63" "75" "66" "42" "71" "80" "59" "68" "59" "79" "59" "72"
## [519] "62" "58" "58" "69"
# First 15 rows of columns 2 and 3 (years_to_birth and vital_status).
hnscd[seq_len(15), 2:3]
##              years_to_birth vital_status
## tcga.4p.aa8j             66            0
## tcga.ba.4074             69            1
## tcga.ba.4075             49            1
## tcga.ba.4076             39            1
## tcga.ba.4077             45            1
## tcga.ba.4078             83            1
## tcga.ba.5149             47            1
## tcga.ba.5151             72            0
## tcga.ba.5152             56            0
## tcga.ba.5153             51            1
## tcga.ba.5555             54            0
## tcga.ba.5556             58            0
## tcga.ba.5557             41            0
## tcga.ba.5558             65            0
## tcga.ba.5559             71            0
# Column 2 (years_to_birth) holds strings, so convert it to numeric before
# summarising it.
num_hn <- as.numeric(hnscd[[2]])
summary(object = num_hn)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   19.00   53.00   61.00   60.89   69.00   89.00       1
# Box plot of the same values.
boxplot(x = num_hn)

上面的箱线图显示,522个案例中约有一半的年龄集中在53到69岁之间(即第一四分位数与第三四分位数之间)。

查看缺失数据:

# Count the missing entries in column 2 (years_to_birth).
sum(is.na(hnscd[[2]]))
## [1] 1

结果显示有一个缺失值。

# Row indices whose column 2 (years_to_birth) is missing. is.na() already
# returns a logical vector, so it is passed to which() directly instead of
# being compared with 1 first.
which(is.na(hnscd[, 2]))
## [1] 168

缺失值位于第168行,该行数据如下:

# Display the full record in row 168, the row reported above as having a
# missing years_to_birth value.
hnscd[168, , drop = FALSE]
##              Composite Element REF years_to_birth vital_status
## tcga.cq.a4ca                 value           <NA>            0
##              days_to_death days_to_last_followup primary_site_of_disease
## tcga.cq.a4ca          <NA>                  <NA>           head and neck
##              neoplasm_diseasestage pathology_T_stage pathology_N_stage
## tcga.cq.a4ca              stage ii                t2                n0
##              pathology_M_stage dcc_upload_date gender
## tcga.cq.a4ca                m0       19-3-2015   male
##              date_of_initial_pathologic_diagnosis days_to_last_known_alive
## tcga.cq.a4ca                                 <NA>                     <NA>
##              radiation_therapy                     histological_type
## tcga.cq.a4ca               yes head and neck squamous cell carcinoma
##              radiations_radiation_regimenindication
## tcga.cq.a4ca                                    yes
##              number_pack_years_smoked year_of_tobacco_smoking_onset
## tcga.cq.a4ca                       80                          1963
##              number_of_lymph_nodes race ethnicity batch_number
## tcga.cq.a4ca                     0 <NA>      <NA>     265.34.0