好久没有发简书了,去学习新技术新知识去了,实在是没有时间!!现在有点时间,记录分享一下新知识~
这段时间也是被这个eQTL搞得很头疼,现在终于弄出来,记录记录!
Matrix eQTL是一款针对大型矩阵可以超快运行进行eQTL分析的软件(http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/)。
一、首先就是数据的准备!(很重要)
先要准备4个文件(5个)
1.SNP信息文件和SNP位置文件,SNP信息文件需要012类型
好多人没有这个类型的snp文件,下面我就一步一步的教你们
#!/bin/bash
vcf=/your_path/all.rename_sorted.snp.vcf
## MAF、缺失率过滤
plink --vcf $vcf --make-bed --geno 0.1 --maf 0.05 --out snp.v1
plink --bfile snp.v1 --hardy
echo "计算所有位点的HWE的P值;生成plink.hwe"
plink --bfile snp.v1 -hwe 1e-4 -make-bed -out plink.hwe
echo "设定过滤标准1e-4l"
plink --bfile plink.hwe --recodeA --out snp.v2
echo "生成 .raw 文件,即为 012 矩阵,缺失的点标记为 NA"
cat snp.v2.raw | cut -d" " -f1,7- | sed 's/_[A-Z]//g' > genotype.txt
echo "2-6 列删除,并格式化 RS ID"
awk '{i=1;while(i <= NF){col[i]=col[i] $i " ";i=i+1}} END {i=1;while(i<=NF){print col[i];i=i+1}}' genotype.txt | sed 's/[ \t]*$//g' > genotype_transpose.txt
echo "用 awk 进行矩阵转置"
sed -i "s# #\t#g" genotype_transpose.txt
# SNP 位置信息可从 .map 文件获取:
plink --bfile plink.hwe --recode --out snp.v3
echo "snpid chr pos" >snpsloc.txt
awk '{print $2,"chr"$1,$4}' snp.v3.map >>snpsloc.txt
sed -i "s# #\t#g" snpsloc.txt
现在生成的genotype_transpose.txt
就是SNP信息文件,snpsloc.txt
就是SNP位置文件
head(genotype_transpose.txt)
FID DNA1 DNA2 DNA3 DNA4 DNA5 DNA6 DNA7 DNA8 DNA9 DNA10 DNA11 DNA12 DNA13 DNA14 DNA15 DNA16 DNA18 DNA19 DNA20 DNA21 DNA22 DNA23 DNA24 DNA25 DNA26 DNA27 DNA28 DNA30 DNA31 DNA32 DNA33
Chr1-30469 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
Chr1-30517 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0
Chr1-30533 0 1 1 1 1 1 2 0 1 0 1 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0
Chr1-69062 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
Chr1-89929 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0
Chr1-91327 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
Chr1-130169 0 0 0 0 0 2 0 1 0 2 0 0 1 0 0 0 0 0 0 0 2 0 1 0 0 1 2 0 0 0
Chr1-188094 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-190417 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-191152 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-191944 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-194442 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-199462 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-199533 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-204570 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-206474 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-217339 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-217421 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-217626 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-217663 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-220802 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-221433 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-221715 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-226134 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NA 0 0 0 0 0 0
Chr1-226530 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-227955 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-235012 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
Chr1-241685 0 0 0 0 0 0 0 0 0 0 0
head(snpsloc.txt)
snpid Chr pos
Chr1-30469 Chr01 30469
Chr1-30517 Chr01 30517
Chr1-30533 Chr01 30533
Chr1-69062 Chr01 69062
Chr1-89929 Chr01 89929
Chr1-91327 Chr01 91327
Chr1-130169 Chr01 130169
Chr1-188094 Chr01 188094
Chr1-190417 Chr01 190417
2.准备自己的表达量矩阵,也可以对矩阵进行取log2,进行标准化
这个文章里面就是这个操作的:eQTL Regulating Transcript Levels Associated with Diverse Biological Processes in Tomato
3.准备自己的基因位置信息
grep "gene" geonom.gff3 |grep -v "ChrUn" | awk -F ";" '{print $1}' | sed -e "s/ID=//g" | awk '{print $9"\t"$1"\t"$4"\t"$5}' > geneloc.txt
head(geneloc.txt)
LOC_Os01g01010 Chr01 2903 10817
LOC_Os01g01019 Chr01 11218 12435
LOC_Os01g01030 Chr01 12648 15915
LOC_Os01g01040 Chr01 16292 20323
LOC_Os01g01050 Chr01 22841 26971
LOC_Os01g01060 Chr01 27136 28651
LOC_Os01g01070 Chr01 29818 34493
一定要注意,snpsloc.txt
和geneloc.txt
的Chr
一定要对应,不能一个是Chr一个是chr!!!
4.正式开始eQTL分析
library(MatrixEQTL)
## Settings
# Genotype file name
SNP_file_name = paste( "/your_path/genotype_transpose.txt", sep="\t");
snps_location_file_name = paste( "/your_path/snpsloc.txt", sep="\t");
# Gene expression file name
expression_file_name = paste( "/your_path/experssion.csv", sep=" ");
gene_location_file_name = paste( "/your_path/geneloc.txt", sep="");
## Load genotype data
snps = SlicedData$new();
snps$fileDelimiter = "\t"; # the TAB character
snps$fileOmitCharacters = "NA"; # denote missing values;
snps$fileSkipRows = 1; # one row of column labels
snps$fileSkipColumns = 1; # one column of row labels
snps$fileSliceSize = 2000; # read file in slices of 2,000 rows
snps$LoadFile(SNP_file_name);
## Load gene expression data
gene = SlicedData$new();
gene$fileDelimiter = "\t"; # the TAB character
gene$fileOmitCharacters = "NA"; # denote missing values;
gene$fileSkipRows = 1; # one row of column labels
gene$fileSkipColumns = 1; # one column of row labels
gene$fileSliceSize = 2000; # read file in slices of 2,000 rows
gene$LoadFile(expression_file_name);
## Run the analysis
snpspos = read.table(snps_location_file_name, header=TRUE, stringsAsFactors = FALSE, sep="\t");
genepos = read.table(gene_location_file_name, header=TRUE, stringsAsFactors = FALSE, sep="\t");
me = Matrix_eQTL_main(
snps = snps,
gene = gene,
output_file_name = "./modelANOVA.tra.txt",
pvOutputThreshold = 1e-5,
useModel = modelANOVA, # modelANOVA, modelLINEAR, or modelLINEAR_CROSS
errorCovariance = numeric(),
verbose = TRUE,
output_file_name.cis = "./modelANOVA.cis.txt",
pvOutputThreshold.cis = 1e-5,
snpspos = snpspos,
genepos = genepos,
cisDist = 2e5,
pvalue.hist = "qqplot",
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE);
pdf("modelANOVA.pdf")
plot(me)
dev.off()
cisDist
界定trans和cis的阈值;pvOutputThreshold
判断trans的阈值;pvOutputThreshold.cis
判断cis的阈值
可以根据自己物种基因组大小,基因-基因间区大小设置cisDist
。
5.用曼哈顿图----展示eQTL结果
# 加载所需的包
library(qqman)
# 读取文件
eQTL_results <- read.table("std3.modelANOVA.tra.txt", header=TRUE, sep="\t")
# 分离SNP列以获取染色体号和位置
eQTL_results$CHR <- as.numeric(gsub("Chr", "", gsub("-.*", "", eQTL_results$SNP)))
eQTL_results$BP <- as.numeric(gsub(".*-", "", eQTL_results$SNP))
# 确保使用未转换的p-value列 'p.value'
plot_data <- eQTL_results[, c("SNP", "CHR", "BP", "p.value")]
# 检查新的数据框
head(plot_data)
# 绘制曼哈顿图
pdf(file="modelANOVA.trans.pdf", width=11, height=8.5)
manhattan(plot_data, chr="CHR", bp="BP", p="p.value", snp="SNP", main="eQTL Manhattan Plot", col=c("blue4", "orange3"))
dev.off()