TCGAbiolinks下载甲基化数据

安装补充包

  • conda activate tcga
  • # conda install -c bioconda bioconductor-sesamedata -y
  • conda install -c conda-forge r-magick -y
  • BiocManager::install(“sesameData”)
  • BiocManager::install(“sesame”)
  • BiocManager::install(“doParallel”)
  • BiocManager::install(“ComplexHeatmap”)
  • BiocManager::install(“matlab”)
  • conda install -c bioconda bioconductor-motifstack -y
  • # conda install -c bioconda bioconductor-bsgenome.hsapiens.ucsc.hg38 -y
  • BiocManager::install(“BSgenome.Hsapiens.UCSC.hg38”)
  • BiocManager::install(“BSgenome.Hsapiens.UCSC.hg19”)
  • # conda install -c bioconda bioconductor-genomicranges -y
  • conda install -c bioconda bioconductor-rgadem -y

下载数据

1
2
3
4
5
6
7
8
9
10
library(TCGAbiolinks)
query_met.hg38 <- GDCquery(
project= "TCGA-PRAD",
data.category = "DNA Methylation",
data.type = "Methylation Beta Value",
platform = "Illumina Human Methylation 450"
)
GDCdownload(query_met.hg38)
data.hg38 <- GDCprepare(query_met.hg38)
saveRDS(data.hg38, 'prad.meth.rds')

清洗数据

1
2
3
4
5
6
7
8
9
10
library(SummarizedExperiment)
# 删除包含 NA 值的探针
data.hg38 <- subset(data.hg38,subset = (rowSums(is.na(assay(data.hg38))) == 0))
# 去除重复样本
data.hg38 <- data.hg38[, substr(colnames(data.hg38), 14, 16) == "01A"]
group <- readRDS('../idea_2/fig3.2/fig5/tcga.predict.rds')
gRow <- intersect(data.hg38$patient, rownames(group))
group <- group[gRow,]
data.hg38 <- data.hg38[, substr(colnames(data.hg38), 1, 12) %in% gRow]
data.hg38$group <- group[substr(colnames(data.hg38), 1, 12), 'group']

计算整体差异

1
2
3
4
5
6
7
TCGAvisualize_meanMethylation(
data.hg38,
groupCol = "group",
group.legend = "Groups",
filename = NULL,
print.pvalue = TRUE
)

识别差异甲基化 CpG 位点

1
2
3
4
5
6
7
8
9
10
11
12
13
res <- TCGAanalyze_DMC(
data.hg38,
# colData 函数获取的矩阵中分组列名
groupCol = "group",
group1 = "High Risk",
group2 = "Low Risk",
p.cut = 0.05,
diffmean.cut = 0.15,
save = FALSE,
legend = "State",
plot.filename = "./PRAD_metvolcano.png",
cores = 8
)

绘制热图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
sig_met <- subset(res, status != "Not Significant")
res_data <- subset(data.hg38, subset = (rownames(data.hg38) %in% rownames(sig_met)))
library(circlize)
library(ComplexHeatmap)
group <- group[substr(colnames(data.hg38), 1, 12), ]
#定义注释信息
ha <- HeatmapAnnotation(`Risk group` = group$group,
`Risk Score` = group$Risk_Score,
col = list(
`Risk group` = c("Low Risk" = "#99CCFFAA", 'High Risk' = '#FF6666AA')
),
# annotation_name_gp = gpar(fontsize = 14),
show_annotation_name = TRUE)
heatmap <- Heatmap(
assay(res_data),
name = "DNA methylation",
col = matlab::jet.colors(200),
show_row_names = FALSE,
cluster_rows = TRUE,
cluster_columns = FALSE,
show_column_names = FALSE,
bottom_annotation = ha,
column_title = "DNA Methylation",
column_order = order(group$Risk_Score)
)
pdf("./prad_meth_heatmap.pdf",width = 6, height = 4);{
draw(heatmap, annotation_legend_side = "bottom")
};dev.off()

模体分析(需要hg19)

1
2
3
4
5
6
7
8
9
query_met.hg19 <- GDCquery(
project= "TCGA-PRAD",
data.category = "DNA methylation",
platform = "Illumina Human Methylation 450",
legacy = TRUE # hg19
)
GDCdownload(query_met.hg19)
data.hg19 <- GDCprepare(query_met.hg19)
saveRDS(data.hg38, 'prad.meth.hg19.rds')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
library(rGADEM)
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg19)
library(motifStack)
probes <- rowRanges(res_data)
# 获取差异的探针并设置 200bp 大小的窗口
sequence <- GRanges(
seqnames = as.character(seqnames(probes)),
ranges = IRanges(start = start(ranges(probes)) - 100,
end = start(ranges(probes)) + 100),
strand = "*"
)
# 识别模体
gadem <- GADEM(sequence, verbose = FALSE, genome = Hsapiens)

因为hg19的数据还在下载,更多内容见官方教程


TCGAbiolinks下载甲基化数据
https://b.limour.top/2010.html
Author
Limour
Posted on
September 8, 2022
Licensed under