SingleR (二) 使用参考数据集预测分群

发布于 2021-09-20  549 次阅读


第一步 构建 21.09.20.SingleR.R

# 0、加载程辑包
library(Seurat)
library(dplyr)
library(patchwork)

# 加载 SingleR
library(SingleR)
library(SummarizedExperiment)
library(scater)
library(BiocParallel) # 并行计算加速

# 配置数据路径
root_path = "~/zlliu/R_data/21.07.15.IntegrateData"

# 配置结果保存路径
output_path = "~/zlliu/R_output/21.09.20.SingleR"
if (!file.exists(output_path)){dir.create(output_path)}

# 设置工作目录,输出文件将保存在此目录下
setwd(output_path)
getwd()

# 1、读取数据和参考数据集
scRNAs = readRDS(file.path(root_path, 'hBLA_scRNAs.rds'))
if (!("hM1.se" %in% ls())){
    hM1.se <- readRDS("/home/rqzhang/zlliu/R_data/human_M1_10x/hM1.se.rds")
}
if (!("hmca" %in% ls())){
    hmca <- readRDS("/home/rqzhang/zlliu/R_data/Human_Multiple_Cortical_Areas_SMART-seq/hmca.rds")
}

# 1.1、裁剪数据集
f_noNull <- function(lc_se, lc_col){
    if(any(lc_se$meta["sample_name"] != colnames(lc_se))){
        return
    }
    lc_idx = which(lc_se$meta[lc_col] == "")
    lc_res = lc_se[, -lc_idx]
    lc_res$meta = lc_se$meta[-lc_idx,]
    lc_res
}

hmca <- f_noNull(hmca, "subclass_label")

# 2、构造预测方法
f_get_pred <- function(scRNA, lc_i){
    
    # 如果已经计算过了,就不再重复计算,直接返回上一次的结果
    if(file.exists(sprintf( "%02d_hM1_subclass.txt", lc_i))){
        result_main_hpca <- read.table(sprintf( "%02d_hM1_subclass.txt", lc_i), stringsAsFactors = F)
        return (result_main_hpca)
    }
    
    # 导出原始counts矩阵
    test.count=as.data.frame(scRNA[["RNA"]]@counts)
    
    # 保留共同基因
    common_hpca <- intersect(rownames(test.count), c(rownames(hM1.se), rownames(hmca)))
    test.count_forhpca <- test.count[common_hpca,]
    tp_idx = na.omit(match(common_hpca, rownames(hM1.se)))
    lc_hM1.se <- hM1.se[rownames(hM1.se)[tp_idx], ]
    tp_idx = na.omit(match(common_hpca, rownames(hmca)))
    lc_hmca <- hmca[rownames(hmca)[tp_idx], ]

    gc()
    
    # 生成test数据集
    test.count_forhpca.se <- SummarizedExperiment(assays=list(counts=test.count_forhpca))
    test.count_forhpca.se <- logNormCounts(test.count_forhpca.se)
    
    # 进行分类预测 大概一小时
    pred.main.hpca <- SingleR(test = test.count_forhpca.se, ref = list(m1=lc_hM1.se, mca=lc_hmca), labels = list(hM1.se$meta$subclass_label, hmca$meta$subclass_label), BPPARAM=MulticoreParam(32)) # 32CPUs

    gc()
    
    #构造返回结果
    result_main_hpca <- as.data.frame(pred.main.hpca$labels)
    result_main_hpca$CB <- rownames(pred.main.hpca)
    colnames(result_main_hpca) <- c('hM1_subclass', 'CB')
    write.table(result_main_hpca, sprintf( "%02d_hM1_subclass.txt", lc_i)) #保存下来,方便以后调用
    result_main_hpca
}

# 3、进行预测
tp_sc <- scRNAs
# rm(x10s)
gc()
tp_sc_st <- 1
tp_sc_len <- length(tp_sc)
for (lc_i in tp_sc_st:tp_sc_len) {
    f_get_pred(tp_sc[[lc_i]], lc_i)
}

# 4、读取预测信息的函数
f_set_pred <- function(scRNA, result_main_hpca){
    scRNA@meta.data$CB <- rownames(scRNA@meta.data)
#     print(scRNA@meta.data$CB)
    scRNA@meta.data=merge(scRNA@meta.data,result_main_hpca,by="CB")
#     print(result_main_hpca$CB)
    rownames(scRNA@meta.data)=scRNA@meta.data$CB
    scRNA
}

for (lc_i in tp_sc_st:tp_sc_len) {
    tp_sc[[lc_i]] <- f_set_pred(tp_sc[[lc_i]], f_get_pred(tp_sc[[lc_i]], lc_i))
}

# 5、查找marker
f_findMarkers <- function(lc_scRNA, lc_group.by){
    Idents(lc_scRNA) <- lc_scRNA@meta.data[,lc_group.by]
    lc_markers <- FindAllMarkers(lc_scRNA, only.pos = TRUE, group.by = lc_group.by, min.pct = 0.25, logfc.threshold = 0.25)
    lc_markers <- subset(lc_markers, subset = p_val_adj<0.05)
    lc_markers
}

tp_markers  <- list()
for (lc_i in 1:length(tp_sc)) {
    tp_markers[[lc_i]] <- f_findMarkers(tp_sc[[lc_i]], 'hM1_subclass')
}

gl_MarkerGene <- read.csv(file.path("~/zlliu/R_data/hBLA", "cellType_Markers.csv"), header = F)

f_matchKownMarker <- function(lc_markers_all, lc_markers_kown){
    lc_CellType <- lc_markers_kown[match(rownames(lc_markers_all), lc_markers_kown[,1]),2]
    cbind(lc_markers_all, lc_CellType)
}

for (lc_i in 1:length(tp_sc)) {
    write.csv(f_matchKownMarker(tp_markers[[lc_i]], gl_MarkerGene), sprintf( "%02d_significant_markers_%s.csv", lc_i, names(tp_sc)[lc_i]))
}

gl_MarkerGene <- read.csv("~/zlliu/R_data/hBLA/TableS3.csv", header = T)

for (lc_i in 1:length(tp_sc)) {
    write.csv(f_matchKownMarker(tp_markers[[lc_i]], gl_MarkerGene), sprintf( "%02d_significant_markers_%s_full.csv", lc_i, names(tp_sc)[lc_i]))
}

第二步 构建 21.09.20.SingleR.pbs 并提交

#!/bin/bash
#PBS -q fat
#PBS -V
#PBS -o /home/rqzhang/zlliu/PBS/SingleR/SingleR.out
#PBS -e /home/rqzhang/zlliu/PBS/SingleR/SingleR.err
#PBS -l nodes=1:ppn=32
#PBS -r y
cd /home/rqzhang
Rscript /home/rqzhang/zlliu/PBS/SingleR/21.07.15.10x.hM1.se.R
echo $HOME
  • 提交job:qsub /home/rqzhang/zlliu/PBS/SingleR/21.09.20.SingleR.pbs
  • 查看所有任务:qstat

医学生