1. 数据准备
# Download the raw supplementary archive for GEO series GSE206528.
# The working directory is named after the full accession (GSE206528) so it
# matches the archive name and the directory read by the R step.
$ mkdir GSE206528
$ wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE206nnn/GSE206528/suppl/GSE206528_RAW.tar
# Unpack the archive into the dataset directory
$ tar -xf GSE206528_RAW.tar -C GSE206528/
$ cd GSE206528
# Decompress every gzipped file extracted from the archive
$ gunzip *.gz
2. 数据整合
# 加载所需的R包
library(Seurat)
library(dplyr)
library(readr)
# Directory that holds the decompressed per-sample CSV files
file_path <- "/data/shumin/ED/GSE206528/"
# 1. List all CSV files.
# `pattern` is a regular expression, not a shell glob: escape the dot and
# anchor at the end so only names ending in ".csv" are returned.
file_names <- list.files(
  path = file_path,
  pattern = "\\.csv$",
  full.names = TRUE
)
# 2. Read one per-sample CSV file and build the matching per-cell metadata.
#
# Args:
#   file: Path to a CSV file. The first column is used as the row names and
#     every remaining column is treated as one cell. The base name is
#     expected to look like "<sample>_<group>[_...].csv".
#
# Returns:
#   A list with two elements:
#     data: the data frame returned by read.csv().
#     meta: a data frame with one row per column of `data`, row names equal
#       to colnames(data), and character columns `Sample` and `Group`
#       (`Group` is NA when the file name has no second "_"-separated part).
process_file <- function(file) {
  # row.names = 1 turns the first column into the row names.
  # NOTE(review): read.csv() keeps its default check.names = TRUE here, so
  # column names that are not syntactically valid R names get rewritten.
  data <- read.csv(file, row.names = 1)

  # Strip the ".csv" suffix before splitting so that a two-part name such as
  # "GSM1_ED.csv" yields the group "ED" rather than "ED.csv".
  file_stem <- sub("\\.csv$", "", basename(file))
  name_parts <- strsplit(file_stem, "_", fixed = TRUE)[[1]]
  sample_name <- name_parts[1]
  group <- name_parts[2]

  # One metadata row per cell (column of `data`). Keying the rows by the
  # cell names lets the metadata be matched to the cells by name later on
  # instead of relying on row order alone.
  n_cells <- ncol(data)
  meta <- data.frame(
    Sample = rep(sample_name, n_cells),
    Group = rep(group, n_cells),
    row.names = colnames(data),
    stringsAsFactors = FALSE
  )

  list(data = data, meta = meta)
}
# 3. Process every file; each result holds that file's `data` and `meta`
if (length(file_names) == 0L) {
  stop("No input files to process: `file_names` is empty.", call. = FALSE)
}
results <- lapply(file_names, process_file)
# Split the results into the count tables and one combined metadata frame
data_list <- lapply(results, `[[`, "data")
meta_list <- do.call(rbind, lapply(results, `[[`, "meta"))
# 4. Combine all count tables column-wise.
# cbind() binds data frames by row position, not by row name, so make sure
# every file lists the same features in the same order before binding.
reference_features <- rownames(data_list[[1]])
same_features <- vapply(
  data_list,
  function(counts) identical(rownames(counts), reference_features),
  logical(1)
)
if (!all(same_features)) {
  stop(
    "Feature (row) names differ from the first file in: ",
    paste(basename(file_names[!same_features]), collapse = ", "),
    call. = FALSE
  )
}
combined_data <- do.call(cbind, data_list)
# 5. Create the Seurat object
seurat_obj <- CreateSeuratObject(counts = combined_data)
# 6. Add the metadata to the Seurat object.
# The metadata rows are in the same order as the columns of `combined_data`
# (both are built file by file). Key them by the object's cell names so the
# metadata is matched by name; the check guards the one-row-per-cell
# assumption.
stopifnot(nrow(meta_list) == ncol(seurat_obj))
rownames(meta_list) <- colnames(seurat_obj)
seurat_obj <- AddMetaData(seurat_obj, metadata = meta_list)
head(seurat_obj)
## orig.ident nCount_RNA nFeature_RNA Sample Group
## LZ037.AAACCCAAGAGGGTGG.1_1 SeuratProject 4407 1673 GSM6255907 Normal
## LZ037.AAACCCAAGATACGAT.1_1 SeuratProject 6786 2275 GSM6255907 Normal
## LZ037.AAACCCAAGATGGCAC.1_1 SeuratProject 5277 2130 GSM6255907 Normal
## LZ037.AAACCCAAGGGAGTGG.1_1 SeuratProject 5773 2050 GSM6255907 Normal
## LZ037.AAACCCAAGTTGGACG.1_1 SeuratProject 6518 2099 GSM6255907 Normal
## LZ037.AAACCCACAACTTCTT.1_1 SeuratProject 1595 838 GSM6255907 Normal
## LZ037.AAACCCACAATCTCGA.1_1 SeuratProject 6017 2047 GSM6255907 Normal
## LZ037.AAACCCACAATGCTCA.1_1 SeuratProject 3954 1401 GSM6255907 Normal
## LZ037.AAACCCACAGCCGGTT.1_1 SeuratProject 11695 3281 GSM6255907 Normal
## LZ037.AAACCCAGTATGATCC.1_1 SeuratProject 6469 2092 GSM6255907 Normal
# Normalize, pick variable features, scale, and run PCA
seurat_obj <- seurat_obj %>%
  NormalizeData() %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA(verbose = FALSE)
# Harmony batch correction (by sample), clustering, and embeddings.
# RunHarmony() is called through its namespace because the harmony package
# is not attached by the library() calls of this script.
scRNA <- harmony::RunHarmony(seurat_obj, group.by.vars = "Sample")
# Use the same Harmony dimensions for the neighbour graph and for both
# embeddings. RunUMAP() needs `dims` (or `features`/`graph`) to be given
# explicitly, and RunTSNE() would otherwise fall back to its own default
# dims instead of the ones used for clustering.
harmony_dims <- 1:20
scRNA <- scRNA %>%
  FindNeighbors(reduction = "harmony", dims = harmony_dims) %>%
  FindClusters(resolution = 0.6)
scRNA <- RunUMAP(scRNA, reduction = "harmony", dims = harmony_dims)
scRNA <- RunTSNE(scRNA, reduction = "harmony", dims = harmony_dims)