Download released RNA-seq Data
- direct download SRA files using idm
- sra to fastq
fastq-dump --gzip --split-files {}.sra
From company RNA-seq Data
MD5 Check
Goal
check the integrity of RNA-SEQ data
Scripts
check md5
md5sum test_1.fastq.gz > md5_cal.txt
compare md5 value (or direct ctrl+find)
diff md5_cal.txt md5_from_com.txt
FastQC
Goal
spot potential problems in high througput sequencing datasets
Package
FastQC
Scripts
Install FastQC
conda install fastqc
Apply FastQC
fastqc -o <%.fq.gz> <%.fq.gz>
Input
sequenced data.fq.gz
Output
web
see https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
Trimmomatic
Goal
trimming illumine paired-end and single ended date
Package
trimmomatic
Scripts
Install trimmomatic
conda install trimmomatic
Apply trimmomatic
java -jar trimmomatic-0.38.jar PE -phred33 <input_1%.fq.gz> <input_2%.fq.gz> <output_1_paird%.fq.gz> <output_1_unpaired%.fq.gz> <output_2_paird%.fq.gz> <output_2_unpaired%.fq.gz> A2_1_paired.fq.gz A2_1_unpaired.fq.gz A2_2_paired.fq.gz A2_2_unpaired.fq.gz ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:14 MINLEN:36
Input
sequenced date.fq.gz
Output
output_1_paird%.fq.gz output_1_unpaired%.fq.gz
output_2_paird%.fq.gz output_2_unpaired%.fq.gz
see https://github.com/usadellab/Trimmomatic
Trinity
Goal
de novo transcriptome assembly
Package
Trinity
Scripts
Install Trinity
conda create -n trinity_2.8.5 python=3.7 #create env for trinity
conda install trinity=2.8.5
Apply Trinity
#SBATCH --mem=50G
source ~/.bashrc
Hash -r
module load bowtie/2.4.2
module load samtools/1.14
source activate
conda activate trinity_2.8.5
Trinity --seqType fq --trimmomatic --no_version_check --max_memory 50G \
--left <%_1.fq.gz> \
--right <%_2.fq.gz> \
--CPU 12 --output ./Trinity_out
Input
<%_1.fq.gz>
<%_2.fq.gz>
Output
Trinity.fasta
See https://github.com/trinityrnaseq/trinityrnaseq
Transdecoder
Goal
find coding regions within transcripts
Package
transdecoder
Scripts
source ~/.bashrc
conda activate transdecoder
module load blast/2.11.0+
module load hmmer/3.3.2
# find open reading frames
TransDecoder.LongOrfs -t Trinity.fasta
#blast orf and Swissport database
blastp -query Trinity.fasta.transdecoder_dir/longest_orfs.pep -db swissprot.00 -max_target_seqs 1 -outfmt 6 -evalue 1e-5 -num_threads 12 > blastp.outfmt6
#compare orf and pfam protein database
hmmscan --cpu 12 --domtblout pfam.domtblout Pfam-A.hmm Trinity.fasta.transdecoder_dir/longest_orfs.pep
#predict cds region and translate pep
TransDecoder.Predict -t Trinity.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.outfmt6
Input
Trinity.fasta
Swissport database
pfam protein database
Output
Trinity.fasta.transdecoder.pep
Trinity.fasta.transdecoder.gff3
Trinity.fasta.transdecoder.cds
Trinity.fasta.transdecoder.bed
cd-hit/py
Goal
Obtain the longest isoform peptide
Package
python
Scripts
#Install biopython
pip install biopython
#Apply python scripts
python pick_the_longest_isoform_for_trinity.py Trinity.fasta.transdecoder.pep
Input
pick_the_longest_isoform_for_trinity.py
Trinity.fasta.transdecoder.pep
Output
Trinity.fasta.transdecoder_longest_iso.fas
Orthofinder
Goal
find single ortholog gene
Package
orthofinder
Scripts
Install orthofinder
conda create -n orthofinder
conda install orthofinder
Apply orthofinder
source ~/.bashrc
conda activate orthofinder
ulimit -n 10000
orthofinder -f %nr_pep.filefolder
Input
all nr_pep.fasta
Output
Single_Copy_Orthologue_Sequences
see https://github.com/davidemms/OrthoFinder
Muscle/Mafft Align
Goal
Align single copy otrthologue sequences
Package
muscle/mafft
Scripts
Install muscle/mafft
conda install muscle/mafft
Apply muscle/mafft
perl bsub_muscle.edioted.pl -d Single_Copy_Orthologue_Sequences/
Input
Single_Copy_Orthologue_Sequences.filefolder
Output
muscle_out/mafft_out.filefolder
see https://github.com/GSLBiotech/mafft
see https://github.com/rcedgar/muscle
see https://www.runoob.com/perl/perl-tutorial.html
Unify the sequence title And Concentrate
Package
parallel
Install parallel
see linux下的parallel:强大的并行 - 简书 (jianshu.com) (侵删)
Scripts
Unify the sequence title
ls | awk -F '.' '{print $1}' > name.txt
parallel "sed -i 's/>/&Group_{}|/g' {}.out" :::: name.txt
parallel "sed -i 's/TRINITY/|&/g' {}.out" :::: name.txt
parallel "sed -i 's/comp/|&/g' {}.out" :::: name.txt
sed -i 's/PPYR_/PPY|&/g' *.out
sed -i 's/AQULA_/AQU|&/g' *.out
sed -i 's/ILUMI_/ILU|&/g' *.out
sed -i 's/Lamprigera_yunnana_/LYU|&/g' *.out
sed -i 's/Abscontida_termibalis_/ATE|&/g' *.out
sed -i 's/|PPE/&|/g' *.out
sh rename.sh
eg: Group_OG0005508|ELI|TRINITY_DN3583_c0_g1_i3.p1
supermatrix
perl parse_bsub_mafft_out.pl -d muscle_out/ -o muscle.aln.concatenated.fa
Input
muscle_out/mafft_out.filefolder
Output
muscle_out/mafft_out.filefolder
GBlocks
Goal
Define blocks using Go-lang types.
Simplify mutations to reduce the need for per-block custom code.
Build toolboxes from Go-lang instances.
Mirror Go-lang data to/from Blocky blocks ( to provide alternative serialization and processing. )
Package
GBlocks
Scripts
Install GBlocks
conda install GBlocks
Apply GBlocks
Gblocks muscle.aln.concatenated.fa -t=p -b4=10 -b5=n #amino acid sequence
Input
muscle.aln.concatenated.fa
Output
muscle.aln.concatenated.fa-gb
muscle.aln.concatenated.fa-gb.htm
Format transform fasta to phy
Goal
transform format: fasta to phy for Constructing ML tree
Software
Geneious
Input
muscle.aln.concatenated.fa-gb.fasta
Output
muscle.aln.concatenated.fa-gb.phy
RAxML
Goal
Construct the ML phylogenetic tree and explore species relationship
Package
raxml
Scripts
Install raxml
conda install raxml
Apply raxml
sh1:
raxmlHPC -T 12 -f a -x 12345 -p 12345 -# 100 -m PROTGAMMAAUTO -o LAB -s /storage/zhenyingLab/xuxiaodong/Tree/44_muscle.aln.concatenated_gb.phy -n 44_tree_zhu
sh2:
raxmlHPC-PTHREADS -T 24 -m PROTGAMMAJTT -s muscle.aln.concatenated.fa-gb.nogap.sl.phy -p 123456 -n _tree
raxmlHPC-PTHREADS -T 24 -p 123456 -b 1234567 -# 100 -s muscle.aln.concatenated.fa-gb.nogap.sl.phy -n _tree_bs -m PROTGAMMAJTT
raxmlHPC-PTHREADS -f b -t RAxML_bestTree._tree -z RAxML_bootstrap._tree_bs -m PROTGAMMAJTT -n consensustree
Input
muscle.aln.concatenated.fa-gb.phy
Output
RAxML_bootstrap._tree
Kallisto + Sleuth for differential gene expression
Kallisto
Goal
map reads on the ref genome and count reads/transcripts
Package
Kallisto
Scripts
establish index using ref_cds_genome/ref_rna
kallisto index -i ref_rna.idx {}_reference.fasta
count reads
kallisto quant -i ref_rna.idx -o output_{} -b 100 {}_Clean_Data1_paired.fq.gz {}_Clean_Data2_paired.fq.gz
Input
Ref_cds_genome
Clean_Data_fa.gz
Output
abundance.h5
abundance.tsv
run_info.json
Sleuth in R.
Goal
compare the abundance of different groups and get the differential gene expression
Package
R
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install()
BiocManager::install("devtools") # only if devtools not yet installed
BiocManager::install("pachterlab/sleuth")
Scripts
建立矩阵
#library_name label method breed stage replicate
kallisto_output_tr_1 tr mute 1 stage1 r1
kallisto_output_tr_2 tr mute 1 stage1 r2
kallisto_output_tr_3 tr mute 1 stage1 r3
kallisto_output_ct_1 ct wildtype 2 stage1 r1
kallisto_output_ct_2 ct wildtype 2 stage1 r2
kallisto_output_ct_3 ct wildtype 2 stage1 r3
Differential gene expression analysis in R scripts
##Set up paths
base_dir <- "1"
sample_path=paste(base_dir,"sleuth_sample_info_DE.tsv", sep="")
sample_path
sample_df <- read.delim(sample_path,header=F, row.names=NULL, comment.char="#")
sample_df
sample_name <- "2"
sample_name
samples <- sample_df[[1]]
kal_dirs <- sapply(samples, function(id) file.path(base_dir, id))
methods <- sample_df[[3]]
breeds <- sample_df[[4]]
stages <- sample_df[[5]]
sample_metadata <- data.frame(path=kal_dirs, sample=samples, method=methods, breed=breeds, stage=stages, stringsAsFactors=FALSE)
sample_metadata
library(sleuth)
so <- sleuth_prep(sample_metadata, num_cores = 1, ~methods,read_bootstrap_tpm = TRUE, extra_bootstrap_summary = TRUE)
so <- sleuth_fit(so, ~methods, 'full')
so <- sleuth_wt(so,which_beta = 'methodswildtype')
models(so)
##DE expression filtering
sleuth_table <- sleuth_results(so, "methodswildtype", show_all = TRUE)
sleuth_significant <- dplyr::filter(sleuth_table, qval <= 0.05)
sleuth_differential_expressed_wildtype <- dplyr::filter(sleuth_significant, b > 1)
head(sleuth_significant, 20)
sleuth_significant
##HE expression filtering
library(magrittr)
ktable_normalized <- kallisto_table(so, use_filtered = FALSE, normalized = TRUE, include_covariates = TRUE)
head(ktable_normalized,20)
ktable_normalized_wildtype <- subset(ktable_normalized, method=='wildtype')
head(ktable_normalized_wildtype,20)
ktable_normalized_mute <- subset(ktable_normalized, method=='mute')
head(ktable_normalized_mute,20)
ktable_orig <- kallisto_table(so, use_filtered = FALSE, normalized = FALSE, include_covariates = FALSE)
head(ktable_orig,20)
wildtype_bsn_tpm <- dplyr::group_by(ktable_normalized_wildtype, target_id) %>% dplyr::summarize(bsn_tpm_wildtype_mean = mean(tpm))
head(wildtype_bsn_tpm,20)
mute_bsn_tpm <- dplyr::group_by(ktable_normalized_mute, target_id) %>% dplyr::summarize(bsn_tpm_mute_mean = mean(tpm))
head(mute_bsn_tpm,20)
mute_bsn_tpm_sorted <- dplyr::arrange(mute_bsn_tpm,desc(bsn_tpm_mute_mean))
head(mute_bsn_tpm_sorted,20)
wildtype_bsn_tpm_sorted <- dplyr::arrange(wildtype_bsn_tpm,desc(bsn_tpm_wildtype_mean))
head(wildtype_bsn_tpm_sorted,20)
wildtype_bsn_tpm_highly_expressed <- subset(wildtype_bsn_tpm_sorted, bsn_tpm_wildtype_mean >= quantile(bsn_tpm_wildtype_mean, 0.90))
head(wildtype_bsn_tpm_highly_expressed,20)
##Table exporting
write.table(ktable_normalized,paste(base_dir,sample_name,".sleuth.BSN-TPM.txt",sep=""), sep="\t",quote = FALSE)
write.table(sleuth_table, paste(base_dir,sample_name,".sleuth.DE_test.txt",sep=""), sep="\t",quote = FALSE)
write.table(sleuth_differential_expressed_wildtype, paste(base_dir,sample_name,".sleuth.DE_test.onlysignificant.wildtype-up.txt",sep=""), sep="\t",quote = FALSE)
write.table(mute_bsn_tpm_sorted, paste(base_dir,sample_name,".sleuth.bs-normalized.TPM.txt",sep=""), sep="\t",quote = FALSE)
write.table(wildtype_bsn_tpm_sorted, paste(base_dir,sample_name,".sleuth.bs-normalized.TPM.wildtype.txt",sep=""), sep="\t",quote = FALSE)
write.table(wildtype_bsn_tpm_highly_expressed, paste(base_dir,sample_name,".sleuth.bs-normalized.TPM.90thPercentile.txt",sep=""), sep="\t",quote = FALSE)
##Manual inspection
sleuth_live(so)
Input
abundance.tsv
矩阵文件.tsv
Output
the files include reads, tpm, p-value and FDR...
Hisat2, StringTie and Deseq2/edgeR for differential gene expression
Hisat2
Goal
map reads
Package
Hisat2
Scripts
establish index
hisat2-build ref_whole_genome output_ref.fa
map reads
#####align reads
hisat2 --dta -t -p 4 -x ref_genome -1 paired.fa.gz -2 paired.fa.gz -S {}.sam
#####convert sam format to bam
samtools view -S {}.sam -b -o {}.bam
#####sort bam
samtools sort -o {}.sorted.bam {}.bam
#####build index
samtools index {}_sorted.bam {}_sorted.bam.bai
#####stat mapping rate
samtools flagstat {}_sorted.bam > {}_sorted.flag.stat
Input
ref_whole_genome
files.fa.gz
Output
sorted.bam
sorted.flag.stat (mapping rate)
StringTie
Goal
assemble reads and count reads/transcripts
Scripts
#####assemble reads
stringtie -e -B -p 4 -G gff_file -o {}.gtf {}_sorted.bam
#####count reads
python prepDE.py -i gtf_files
Input
sorted.bam
gff_file
Output
reads_file
Deseq2
Goal
differential gene expression
Package
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("DESeq2")
library(DESeq2)
Scripts
CountMatrix1<-read.csv("D:/all_gene_count_matrix.csv",sep=",",row.names="gene_id")
#设置样本信息矩阵,包括处理信息:实验组ctrlrep vs. 对照组ISrep,每个有3个
ColumnData<- data.frame(row.names=colnames(CountMatrix1),samName=colnames(CountMatrix1), condition=rep(c("control","treat"),each=3))
CountMatrix1[is.na(CountMatrix1)] <- 0
#生成DESeqDataSet数据集
dds <- DESeqDataSetFromMatrix(countData = CountMatrix1, colData = ColumnData, design= ~condition)
#DESeq差异表达计算
dds<-DESeq(dds)
#生成差异表达结果
res<-results(dds)
summary(res)
#查看总结信息(表达上调,下调等)
head(res)
#统计padj(adjusted p-value)小于0.05的数目
table(res$padj <0.05)
#统计padj(adjusted p-value)小于0.05的数目
res<- res[order(res$padj),]#按padj排序
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
write.csv(resdata,file = "control_treat.csv")
#输出结果到csv文件
deg <- subset(res, padj <= 0.01 & abs(log2FoldChange) >= 1) #筛选显著差异表达基因(padj小于0.01且FoldChange绝对值大于2)
summary(deg) #查看筛选后的总结信息
write.csv(deg, "control_treat.deg.csv") #将差异表达显著的结果输出到csv文件
Input
{}_count.csv
Output
all_gene_count.csv
differential_gene_count.csv
edgeR
Goal
differential gene expression
Package
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("edgeR")
library(edgeR)
Scripts
#读取基因表达矩阵
targets <- read.csv("D:/Download/{}_count.csv",header = T,check.names = F)
row.names(targets) <- make.names(targets[,1],TRUE)
targets <- targets[,-1]
head(targets)
#指定分组,注意要保证表达矩阵中的样本顺序和这里的分组顺序是一一对应的
#对照组在前,处理组在后
group <- rep(c('control', 'treat'), each = 3)
#数据预处理
#(1)构建 DGEList 对象
dgelist <- DGEList(counts = targets, group = group)
#(2)过滤 low count 数据,例如 CPM 标准化(推荐)
keep <- rowSums(cpm(dgelist) > 1 ) >= 2
dgelist <- dgelist[keep, , keep.lib.sizes = FALSE]
#(3)标准化,以 TMM 标准化为例
dgelist_norm <- calcNormFactors(dgelist, method = 'TMM')
#差异表达基因分析
#首先根据分组信息构建试验设计矩阵,分组信息中一定要是对照组在前,处理组在后
design <- model.matrix(~group)
#(1)估算基因表达值的离散度
dge <- estimateDisp(dgelist_norm, design, robust = TRUE)
#(2)模型拟合,edgeR 提供了多种拟合算法
#负二项广义对数线性模型
fit <- glmFit(dge, design, robust = TRUE)
lrt <- topTags(glmLRT(fit), n = nrow(dgelist$counts))
write.table(lrt, 'control_treat.glmLRT.txt', sep = '\t', col.names = NA, quote = FALSE)
#拟似然负二项广义对数线性模型
fit <- glmQLFit(dge, design, robust = TRUE)
lrt <- topTags(glmQLFTest(fit), n = nrow(dgelist$counts))
write.table(lrt, 'control_treat.glmQLFit.txt', sep = '\t', col.names = NA, quote = FALSE)
##筛选差异表达基因
#读取上述输出的差异倍数计算结果
gene_diff <- read.delim('control_treat.glmLRT.txt', row.names = 1, sep = '\t', check.names = FALSE)
#首先对表格排个序,按 FDR 值升序排序,相同 FDR 值下继续按 log2FC 降序排序
gene_diff <- gene_diff[order(gene_diff$FDR, gene_diff$logFC, decreasing = c(FALSE, TRUE)), ]
#log2FC≥1 & FDR<0.01 标识 up,代表显著上调的基因
#log2FC≤-1 & FDR<0.01 标识 down,代表显著下调的基因
#其余标识 none,代表非差异的基因
gene_diff[which(gene_diff$logFC >= 1 & gene_diff$FDR < 0.01),'sig'] <- 'up'
gene_diff[which(gene_diff$logFC <= -1 & gene_diff$FDR < 0.01),'sig'] <- 'down'
gene_diff[which(abs(gene_diff$logFC) <= 1 | gene_diff$FDR >= 0.01),'sig'] <- 'none'
#输出选择的差异基因总表
gene_diff_select <- subset(gene_diff, sig %in% c('up', 'down'))
write.table(gene_diff_select, file = 'control_treat.glmQLFit.select.txt', sep = '\t', col.names = NA, quote = FALSE)
#根据 up 和 down 分开输出
gene_diff_up <- subset(gene_diff, sig == 'up')
gene_diff_down <- subset(gene_diff, sig == 'down')
write.table(gene_diff_up, file = 'control_treat.glmQLFit.up.txt', sep = '\t', col.names = NA, quote = FALSE)
write.table(gene_diff_down, file = 'control_treat.glmQLFit.down.txt', sep = '\t', col.names = NA, quote = FALSE)
Input
{}_count.csv
Output
all_gene_count.csv
differential_gene_count.csv