SingleR (二) 使用参考数据集预测分群

第一步 构建 21.09.20.SingleR.R

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# 0、加载程辑包
library(Seurat)
library(dplyr)
library(patchwork)

# 加载 SingleR
library(SingleR)
library(SummarizedExperiment)
library(scater)
library(BiocParallel) # 并行计算加速

# 配置数据路径
root_path = "~/zlliu/R_data/21.07.15.IntegrateData"

# 配置结果保存路径
output_path = "~/zlliu/R_output/21.09.20.SingleR"
if (!file.exists(output_path)){dir.create(output_path)}

# 设置工作目录,输出文件将保存在此目录下
setwd(output_path)
getwd()

# 1、读取数据和参考数据集
scRNAs = readRDS(file.path(root_path, 'hBLA_scRNAs.rds'))
if (!("hM1.se" %in% ls())){
hM1.se <- readRDS("/home/rqzhang/zlliu/R_data/human_M1_10x/hM1.se.rds")
}
if (!("hmca" %in% ls())){
hmca <- readRDS("/home/rqzhang/zlliu/R_data/Human_Multiple_Cortical_Areas_SMART-seq/hmca.rds")
}

# 1.1、裁剪数据集
f_noNull <- function(lc_se, lc_col){
if(any(lc_se$meta["sample_name"] != colnames(lc_se))){
return
}
lc_idx = which(lc_se$meta[lc_col] == "")
lc_res = lc_se[, -lc_idx]
lc_res$meta = lc_se$meta[-lc_idx,]
lc_res
}

hmca <- f_noNull(hmca, "subclass_label")

# 2、构造预测方法
f_get_pred <- function(scRNA, lc_i){

# 如果已经计算过了,就不再重复计算,直接返回上一次的结果
if(file.exists(sprintf( "%02d_hM1_subclass.txt", lc_i))){
result_main_hpca <- read.table(sprintf( "%02d_hM1_subclass.txt", lc_i), stringsAsFactors = F)
return (result_main_hpca)
}

# 导出原始counts矩阵
test.count=as.data.frame(scRNA[["RNA"]]@counts)

# 保留共同基因
common_hpca <- intersect(rownames(test.count), c(rownames(hM1.se), rownames(hmca)))
test.count_forhpca <- test.count[common_hpca,]
tp_idx = na.omit(match(common_hpca, rownames(hM1.se)))
lc_hM1.se <- hM1.se[rownames(hM1.se)[tp_idx], ]
tp_idx = na.omit(match(common_hpca, rownames(hmca)))
lc_hmca <- hmca[rownames(hmca)[tp_idx], ]

gc()

# 生成test数据集
test.count_forhpca.se <- SummarizedExperiment(assays=list(counts=test.count_forhpca))
test.count_forhpca.se <- logNormCounts(test.count_forhpca.se)

# 进行分类预测 大概一小时
pred.main.hpca <- SingleR(test = test.count_forhpca.se, ref = list(m1=lc_hM1.se, mca=lc_hmca), labels = list(hM1.se$meta$subclass_label, hmca$meta$subclass_label), BPPARAM=MulticoreParam(32)) # 32CPUs

gc()

#构造返回结果
result_main_hpca <- as.data.frame(pred.main.hpca$labels)
result_main_hpca$CB <- rownames(pred.main.hpca)
colnames(result_main_hpca) <- c('hM1_subclass', 'CB')
write.table(result_main_hpca, sprintf( "%02d_hM1_subclass.txt", lc_i)) #保存下来,方便以后调用
result_main_hpca
}

# 3、进行预测
tp_sc <- scRNAs
# rm(x10s)
gc()
tp_sc_st <- 1
tp_sc_len <- length(tp_sc)
for (lc_i in tp_sc_st:tp_sc_len) {
f_get_pred(tp_sc[[lc_i]], lc_i)
}

# 4、读取预测信息的函数
f_set_pred <- function(scRNA, result_main_hpca){
scRNA@meta.data$CB <- rownames(scRNA@meta.data)
# print(scRNA@meta.data$CB)
scRNA@meta.data=merge(scRNA@meta.data,result_main_hpca,by="CB")
# print(result_main_hpca$CB)
rownames(scRNA@meta.data)=scRNA@meta.data$CB
scRNA
}

for (lc_i in tp_sc_st:tp_sc_len) {
tp_sc[[lc_i]] <- f_set_pred(tp_sc[[lc_i]], f_get_pred(tp_sc[[lc_i]], lc_i))
}

# 5、查找marker
f_findMarkers <- function(lc_scRNA, lc_group.by){
Idents(lc_scRNA) <- lc_scRNA@meta.data[,lc_group.by]
lc_markers <- FindAllMarkers(lc_scRNA, only.pos = TRUE, group.by = lc_group.by, min.pct = 0.25, logfc.threshold = 0.25)
lc_markers <- subset(lc_markers, subset = p_val_adj<0.05)
lc_markers
}

tp_markers <- list()
for (lc_i in 1:length(tp_sc)) {
tp_markers[[lc_i]] <- f_findMarkers(tp_sc[[lc_i]], 'hM1_subclass')
}

gl_MarkerGene <- read.csv(file.path("~/zlliu/R_data/hBLA", "cellType_Markers.csv"), header = F)

f_matchKownMarker <- function(lc_markers_all, lc_markers_kown){
lc_CellType <- lc_markers_kown[match(rownames(lc_markers_all), lc_markers_kown[,1]),2]
cbind(lc_markers_all, lc_CellType)
}

for (lc_i in 1:length(tp_sc)) {
write.csv(f_matchKownMarker(tp_markers[[lc_i]], gl_MarkerGene), sprintf( "%02d_significant_markers_%s.csv", lc_i, names(tp_sc)[lc_i]))
}

gl_MarkerGene <- read.csv("~/zlliu/R_data/hBLA/TableS3.csv", header = T)

第二步 构建 21.09.20.SingleR.pbs 并提交

1
2
3
4
5
6
7
8
9
10
#!/bin/bash
#PBS -q fat
#PBS -V
#PBS -o /home/rqzhang/zlliu/PBS/SingleR/SingleR.out
#PBS -e /home/rqzhang/zlliu/PBS/SingleR/SingleR.err
#PBS -l nodes=1:ppn=32
#PBS -r y
cd /home/rqzhang
Rscript /home/rqzhang/zlliu/PBS/SingleR/21.07.15.10x.hM1.se.R
echo $HOME
  • 提交job:qsub /home/rqzhang/zlliu/PBS/SingleR/21.09.20.SingleR.pbs
  • 查看所有任务:qstat

SingleR (二) 使用参考数据集预测分群
https://b.limour.top/678.html
Author
Limour
Posted on
September 20, 2021
Licensed under