数据分析:基因突变瀑布图统计以及可视化

image.png

介绍

基因突变瀑布图统计以及可视化

数据下载

MongolianHCC数据集:

image.png

加载R包

library(maftools)
library(TCGAmutations)
library(circlize)
library(ComplexHeatmap)
library(RColorBrewer)
library(openxlsx)
library(dplyr)

代码

PROJECT_DIR <- "./MongolianHCC/" # replace this line with your local path

source(file.path(PROJECT_DIR, "SCRIPTS", "helper_functions.oncoplot.R"))

sample_info_file <- file.path(PROJECT_DIR, "DATA", "PROCESSED", "patient_sample_metadata_w_clustering_risk.txt")
driver_res_file <- file.path(PROJECT_DIR, "DATA", "ORIGINAL", "mut_full.sig_genes.txt")

out_dir <- file.path(PROJECT_DIR, "RESULTS", "mut_oncoplot")
maf_file <- file.path(PROJECT_DIR, "DATA", "PROCESSED", "maf_after_all_filters.maf")
sample_info.exome.file <- sample_info_file

driver_results_file <- driver_res_file
driver_sig_col <- "q"
driver_sig_thresh <- 0.1
driver_sig_freq <- 0.05
cohort_freq_thresh <- 0.2
# cohort_freq_thresh=NA
gene_list_file <- NA

source(file.path(PROJECT_DIR, "SCRIPTS", "helper_functions.oncoplot.R"))

if (!dir.exists(out_dir)) {
  dir.create(out_dir, recursive = T)
}

if (grepl(".xlsx$", sample_info.exome.file)) {
  library(openxlsx)
  sample_info.exome <- read.xlsx(sample_info.exome.file)
} else {
  sample_info.exome <- read.table(sample_info.exome.file, sep = "\t", header = T, stringsAsFactors = F)
  sample_info.exome$Tumor_Sample_Barcode <- sample_info.exome$WES_T
}

ignoreGenes <- c("TTN", "MUC16", "OBSCN", "AHNAK2", "SYNE1", "FLG", "MUC5B", 
                 "DNAH17", "PLEC", "DST", "SYNE2", "NEB", "HSPG2", "LAMA5", 
                 "AHNAK", "HMCN1", "USH2A", "DNAH11", "MACF1", "MUC17", "DNAH5", 
                 "GPR98", "FAT1", "PKD1", "MDN1", "RNF213", "RYR1", "DNAH2", 
                 "DNAH3", "DNAH8", "DNAH1", "DNAH9", "ABCA13", "SRRM2", "CUBN", 
                 "SPTBN5", "PKHD1", "LRP2", "FBN3", "CDH23", "DNAH10", "FAT4", 
                 "RYR3", "PKHD1L1", "FAT2", "CSMD1", "PCNT", "COL6A3", "FRAS1", 
                 "FCGBP", "RYR2", "HYDIN", "XIRP2", "LAMA1")
# sort(ignoreGenes)

mafObj <- read.maf(maf_file, clinicalData = sample_info.exome)
maf.filter <- mafObj

frac_mut <- data.frame(
  Hugo_Symbol = maf.filter@gene.summary$Hugo_Symbol,
  frac_mut = (maf.filter@gene.summary$MutatedSamples / as.numeric(maf.filter@summary$summary[3])),
  stringsAsFactors = F
)
frac_mut[is.na(frac_mut)] <- 0

# driver_res_file="data/somatic.sig_genes.txt"
# driver_sig_col="q"
# driver_sig_thresh=0.1
# driver_sig_freq=0.05
if (file.exists(driver_res_file)) {
  driver_res <- read.table(driver_res_file, sep = "\t", header = T, quote = "", stringsAsFactors = F)
  colnames(driver_res)[colnames(driver_res) == "gene"] <- "Hugo_Symbol"
  driver_res$FLAG_gene <- driver_res$Hugo_Symbol %in% ignoreGenes
  driver_res_plus <- merge.data.frame(driver_res, frac_mut)
  driver_res_plus <- driver_res_plus[order(driver_res_plus[, driver_sig_col], decreasing = F), ]
  driver_genes <- driver_res_plus$Hugo_Symbol[driver_res_plus[, driver_sig_col] < driver_sig_thresh & driver_res_plus$frac_mut > driver_sig_freq]
} else {
  driver_res_plus <- frac_mut
  driver_genes <- c()
}

# driver_res_plus <- driver_res
cohort_freq_thresh <- 0.1
if (!is.na(cohort_freq_thresh)) {
  freq_genes <- setdiff(driver_res_plus$Hugo_Symbol[driver_res_plus$frac_mut > cohort_freq_thresh], driver_genes)
} else {
  freq_genes <- NULL
}

if (file.exists(as.character(gene_list_file))) {
  custom_gene_list <- read.table(gene_list_file, stringsAsFactors = F)[, 1]
  custom_gene_list <- setdiff(custom_gene_list, c(driver_genes, freq_genes))
} else {
  custom_gene_list <- NULL
}

gene_list <- list(driver_genes, freq_genes, custom_gene_list)
reasons <- c(
  paste0("Driver Gene, ", driver_sig_col, " < ", driver_sig_thresh),
  paste0("Cohort Freq > ", cohort_freq_thresh),
  paste0("Selected Genes")
)

genes_for_oncoplot <- data.frame(Hugo_Symbol = c(), reason = c())
for (i in 1:length(gene_list)) {
  if (is.null(gene_list[[i]][1])) {
    next
  }
  genes_for_oncoplot <- rbind(
    genes_for_oncoplot,
    data.frame(
      Hugo_Symbol = gene_list[[i]],
      reason = reasons[i]
    )
  )
}
genes_for_oncoplot <- cbind(genes_for_oncoplot,
  frac = driver_res_plus$frac_mut[match(genes_for_oncoplot$Hugo_Symbol, driver_res_plus$Hugo_Symbol)]
)

genes_for_oncoplot <- genes_for_oncoplot[!is.na(genes_for_oncoplot$frac), ]
genes_for_oncoplot <- genes_for_oncoplot[order(genes_for_oncoplot$reason, -genes_for_oncoplot$frac), ]
split_idx <- as.character(genes_for_oncoplot$reason)
split_idx <- factor(split_idx, levels = reasons[reasons %in% split_idx])
split_colors <- rainbow(length(levels(split_idx)))
names(split_colors) <- as.character(genes_for_oncoplot$reason[!duplicated(genes_for_oncoplot$reason)])
split_colors <- list(Reason = split_colors)

# source("scripts/helper_functions.R")
oncomat <- createOncoMatrix(maf.filter, g = genes_for_oncoplot$Hugo_Symbol, add_missing = F)$oncoMatrix
write.table(genes_for_oncoplot, file = paste0(out_dir, "/genes_for_oncoplot.txt"), sep = "\t", quote = F, row.names = F)

include_all <- T
if (include_all) {
  ### createOncoMatrix drops empty samples, so this adds them back in
  all_wes_samples <- as.character(sample_info.exome$Tumor_Sample_Barcode[!is.na(sample_info.exome$Tumor_Sample_Barcode)])
  extra_samples <- setdiff(all_wes_samples, colnames(oncomat))
  empty_data <- matrix(data = "", nrow = nrow(oncomat), ncol = length(extra_samples), dimnames = list(rownames(oncomat), extra_samples))
  oncomat <- cbind(oncomat, empty_data)
}

oncomat <- oncomat[match(genes_for_oncoplot$Hugo_Symbol, rownames(oncomat)), ]
onco_genes <- rownames(oncomat)

#### TCGA Comparison Heatmap (from helper_functions.oncoplot.R)
tcga_comparison_results <- make_TCGA_comparison_heatmap(onco_genes, maf.filter, split_at = split_idx)
tcga_comparison_hm <- tcga_comparison_results[[1]]


#### MONGOLIA oncoplot
## Oncoplot parameters
annotate_empty <- ""
annotation_font_size <- 9
annotation_height_frac <- 0.3
onco_width <- 9
onco_height <- NULL

oncomat.plot <- oncomat
colnames(oncomat.plot) <- sample_info.exome$Patient[match(colnames(oncomat.plot), sample_info.exome$Tumor_Sample_Barcode)]

if (is.null(onco_height)) {
  onco_height <- max(round(0.2 * nrow(oncomat.plot), 0) + 2, 5)
}

hm_anno_info <- as.data.frame(sample_info.exome[match(colnames(oncomat.plot), sample_info.exome$Patient), ])
rownames(hm_anno_info) <- hm_anno_info$Patient
hm_anno_info <- hm_anno_info[, c(10:ncol(hm_anno_info))]
hm_anno_info$sex <- ifelse(is.na(hm_anno_info$sex), "NA", ifelse(hm_anno_info$sex == 1, "M", "F"))


grouping_var <- "hdv"
group_counts <- c(0, table(hm_anno_info[, grouping_var], useNA = "ifany"))
names(group_counts) <- c(names(group_counts)[2:length(group_counts)], "last")
group_levels <- sort(unique(hm_anno_info[, grouping_var]))
plot_idx <- c()
for (currgroup_idx in 1:length(group_levels)) {
  # currgroup_idx=1
  currgroup <- group_levels[currgroup_idx]
  curr_subset <- hm_anno_info[hm_anno_info[, grouping_var] == currgroup, ]
  order_classes <- factor(ifelse(is.na(as.character(curr_subset$class)), "Unknown", as.character(curr_subset$class)))
  curr_idx <- orderByGroup(oncomat.plot[, colnames(oncomat.plot) %in% rownames(curr_subset)], order_classes)
  plot_idx <- c(
    plot_idx,
    rownames(curr_subset)[curr_idx]
  )
}

plot_order <- plot_idx

################### TOGGLE FOR TOP/BOTTOM ANNOTATION ###################
##### This is for only class/cluster on top
# top_anno_names <- c("Class")
# names(top_anno_names) <- c("class")
# bot_anno_names <- c("Sex","Age","HCV","HBV","HDV","Stage","Tumor Size","Cirrhosis","Obesity","Alcohol","Smoker","Family History","alb","bil","alt","afp","multinodular","Risk Group")
# names(bot_anno_names) <- c("sex","age_bin","hcv","hbv","hdv","stage","tumor_size","cirrhosis","obesity","alcohol","smoker","FHx_LC","alb","bil","alt","afp","multinodular","risk_bin") # column names
################### TOGGLE FOR TOP/BOTTOM ANNOTATION ###################
##### This is for cluster+HDV on top
top_anno_names <- c("Class", "HDV", "HBV", "HCV")
names(top_anno_names) <- c("class", "hdv", "hbv", "hcv")
bot_anno_names <- c(
  "Sex", "Age", "Obesity", "Smoking", "Alcohol", "Family History", "Cirrhosis",
  "alb", "bil", "alt", "afp", "Tumor Size", "multinodular", "Stage", "Risk Group"
)

names(bot_anno_names) <- c("sex", "age_bin", "obesity", "smoker", "alcohol", 
                           "FHx_LC", "cirrhosis", "alb", "bil", "alt", "afp", 
                           "tumor_size", "multinodular", "stage", "risk_bin") # column names
########################################################################


hm_anno_info <- hm_anno_info[, unique(c(names(top_anno_names), names(bot_anno_names)))]

annocolors <- my_oncoplot_colors(hm_anno_info)


my_types <- unique(unlist(apply(oncomat.plot, 2, unique)))
my_types <- my_types[!my_types %in% c(NA, "", 0)]

col <- c(
  Nonsense_Mutation = "#ad7aff", 
  Missense_Mutation = "#377EB8", 
  Frame_Shift_Del = "#4DAF4A",
  In_Frame_Ins = "#ff008c", 
  Splice_Site = "#FF7F00", 
  Multi_Hit = "#FFFF33", 
  Frame_Shift_Ins = "#A65628",
  In_Frame_Del = "#f781bf", 
  Translation_Start_Site = "#400085", 
  Nonstop_Mutation = "#b68dfc",
  no_variants = "#d6d6d6"
)

top_anno_data <- data.frame(hm_anno_info[, names(top_anno_names)], row.names = rownames(hm_anno_info))
colnames(top_anno_data) <- names(top_anno_names)
bot_anno_data <- data.frame(hm_anno_info[, names(bot_anno_names)], row.names = rownames(hm_anno_info))
colnames(bot_anno_data) <- names(bot_anno_data)

unmutated_annodata <- ifelse(colSums(nchar(oncomat.plot)) == 0, annotate_empty, "")

annocolors$empty <- c("TRUE" = "black", "FALSE" = "white")
top_height <- onco_height * annotation_height_frac * (ncol(top_anno_data) / ncol(bot_anno_data))
top_anno <- HeatmapAnnotation(
  empty = anno_text(
    unmutated_annodata, 
    gp = gpar(fontsize = 6, fontface = "bold", col = "grey10"), 
    location = unit(0.5, "npc"),
    which = "column"),
  df = top_anno_data, 
  name = "top_anno", 
  col = annocolors, 
  show_annotation_name = T, 
  na_col = "grey70", 
  show_legend = F,
  simple_anno_size_adjust = T, 
  height = unit(top_height, "inches"), 
  annotation_name_gp = gpar(fontsize = annotation_font_size)
)
names(top_anno) <- top_anno_names[names(top_anno)]

bot_anno <- HeatmapAnnotation(
  df = bot_anno_data, 
  name = "bot_anno", 
  col = annocolors, 
  show_annotation_name = T, 
  na_col = "white", 
  show_legend = F,
  simple_anno_size_adjust = T, 
  height = unit(onco_height * annotation_height_frac, "inches"),
  annotation_name_gp = gpar(fontsize = annotation_font_size)
)
names(bot_anno) <- bot_anno_names[names(bot_anno)]


# browser()
names(col) <- gsub("_", " ", names(col))
oncomat.plot <- gsub("_", " ", oncomat.plot)

col_split_idx <- hm_anno_info$hdv
onco_base_default <- oncoPrint(
  oncomat.plot,
  alter_fun = alter_fun, 
  col = col, 
  row_order = 1:nrow(oncomat.plot),
  name = "oncoplot",
  show_pct = F,
  top_annotation = top_anno,
  bottom_annotation = bot_anno,
  row_split = split_idx,
  left_annotation = rowAnnotation(Reason = split_idx, col = split_colors, annotation_width = unit(0.2, "mm")),
  row_title = NULL,
  column_title = NULL,
  column_order = plot_order,
  column_gap = unit(0.01, "npc"),
  column_split = col_split_idx,
  width = 1
)

# browser()
save_name <- paste0(out_dir, "/oncoplot.pdf")

pdf(file = save_name, height = onco_height, width = onco_width)
draw(tcga_comparison_hm + onco_base_default, main_heatmap = 2)

class_labels <- hm_anno_info$class[match(plot_order, rownames(hm_anno_info))]
class_labels[is.na(class_labels)] <- "NA"
class_labels <- factor(class_labels)

slice_labels <- hm_anno_info$hdv[match(plot_order, rownames(hm_anno_info))]
slice_labels[is.na(slice_labels)] <- "NA"
slice_labels <- factor(slice_labels)

myclasses <- split(class_labels, slice_labels)
label_idx <- lapply(myclasses, function(x) {
  # x=myclasses[[1]]
  change_idx <- c(which(as.logical(diff(as.numeric(x)))), length(x))
  change_idx <- change_idx / length(x)
  curr_label_idx <- rep(1, length(change_idx))
  for (i in 1:length(change_idx)) {
    prev_start <- ifelse(length(change_idx[i - 1]) > 0, change_idx[i - 1], 0)
    curr_label_idx[i] <- (change_idx[i] - prev_start) / 2 + prev_start
  }
  mynames <- ifelse(diff(c(0, change_idx)) < 0.1, "", levels(x))
  names(curr_label_idx) <- mynames
  return(curr_label_idx)
})

## Add cluster labels to top annotation
for (slice_num in 1:length(label_idx)) {
  for (class_num in 1:length(label_idx[[slice_num]])) {
    labeltxt <- names(label_idx[[slice_num]])[class_num]
    labelpos <- label_idx[[slice_num]][class_num]
    decorate_annotation("Class", slice = slice_num, {
      grid.text(labeltxt, labelpos,
        0.5,
        default.units = "npc", gp = gpar(fontsize = annotation_font_size * 0.9, fontface = "italic", col = "white")
      )
    })
  }
}

dev.off()


####  TABLE containing data for each variant
output_data <- maf.filter@data[maf.filter@data$Hugo_Symbol %in% onco_genes, ]
output_data$tumor_genotype <- apply(output_data[, c("Tumor_Seq_Allele1", "Tumor_Seq_Allele2")], 1, paste, collapse = "/")
output_data$normal_genotype <- apply(output_data[, c("Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2")], 1, paste, collapse = "/")

pheno_info <- sample_info.exome[match(output_data$Tumor_Sample_Barcode, sample_info.exome$Tumor_Sample_Barcode), ]
pheno_info <- cbind(pheno_info[, "Tumor_Sample_Barcode"], pheno_info[, -c("Tumor_Sample_Barcode")])
pheno_columns <- colnames(pheno_info)
names(pheno_columns) <- make.names(pheno_columns, unique = T)

output_data <- cbind(output_data, pheno_info)
cols_for_table <- c(
  "Hugo Symbol" = "Hugo_Symbol",
  "Variant Classification" = "Variant_Classification",
  "Variant Type" = "Variant_Type",
  "Consequence" = "Consequence",
  "Chromosome" = "Chromosome", "Start Position" = "Start_Position", "End Position" = "End_Position", "Strand" = "Strand",
  "Reference Allele" = "Reference_Allele",
  "Tumor Genotype" = "tumor_genotype",
  "Normal Genotype" = "normal_genotype",
  "Transcript Change" = "HGVSc",
  "Protein Change" = "HGVSp_Short",
  "Normal Depth" = "n_depth",
  "Normal Ref Depth" = "n_ref_count",
  "Normal Alt Depth" = "n_alt_count",
  "Tumor Depth" = "t_depth",
  "Tumor Ref Depth" = "t_ref_count",
  "Tumor Alt Depth" = "t_alt_count",
  "Existing Annotation" = "Existing_variation",
  "gnomAD Frequency" = "gnomAD_AF",
  "ExAC Frequency" = "ExAC_AF",
  "1000Genomes Frequency" = "AF",
  "Current Cohort Frequency" = "tumor_freq",
  pheno_columns
)

variant_info <- as.data.frame(output_data)[, cols_for_table]
colnames(variant_info) <- names(cols_for_table)
write.xlsx(variant_info, file = paste0(out_dir, "/Table_2.xlsx"))

tcga.white <- tcga_comparison_results[[2]]
tcga.asian <- tcga_comparison_results[[3]]
tcga_lihc_mc3 <- tcga_comparison_results[[4]]
all_dfs <- list(
  driver_res_plus,
  data.frame(
    MONG_frac_mut = (maf.filter@gene.summary$MutatedSamples / as.numeric(maf.filter@summary$summary[3])),
    Hugo_Symbol = maf.filter@gene.summary$Hugo_Symbol, stringsAsFactors = F
  ),
  data.frame(
    TCGA_All_frac_mut = (tcga_lihc_mc3@gene.summary$MutatedSamples / as.numeric(tcga_lihc_mc3@summary$summary[3])),
    Hugo_Symbol = tcga_lihc_mc3@gene.summary$Hugo_Symbol, stringsAsFactors = F
  ),
  data.frame(
    TCGA_Asian_frac_mut = (tcga.asian@gene.summary$MutatedSamples / as.numeric(tcga.asian@summary$summary[3])),
    Hugo_Symbol = tcga.asian@gene.summary$Hugo_Symbol, stringsAsFactors = F
  ),
  data.frame(
    TCGA_White_frac_mut = (tcga.white@gene.summary$MutatedSamples / as.numeric(tcga.white@summary$summary[3])),
    Hugo_Symbol = tcga.white@gene.summary$Hugo_Symbol, stringsAsFactors = F
  ),
  data.frame(maf.filter@gene.summary)
)


all_dfs_merged <- all_dfs %>%
  Reduce(function(dtf1, dtf2) left_join(dtf1, dtf2, by = "Hugo_Symbol"), .)

write.xlsx(all_dfs_merged, file = paste0(out_dir, "/gene_mutsig_info.xlsx")) # ,asTable = T)
image.png

参考

  • The genomic landscape of Mongolian hepatocellular carcinoma
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 206,214评论 6 481
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 88,307评论 2 382
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 152,543评论 0 341
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 55,221评论 1 279
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 64,224评论 5 371
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 49,007评论 1 284
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 38,313评论 3 399
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 36,956评论 0 259
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 43,441评论 1 300
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 35,925评论 2 323
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 38,018评论 1 333
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 33,685评论 4 322
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 39,234评论 3 307
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 30,240评论 0 19
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 31,464评论 1 261
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 45,467评论 2 352
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 42,762评论 2 345

推荐阅读更多精彩内容