R语言进行相关性分析

cor函数

Pearson、Spearman、Kendall相关系数都可以通过cor函数实现，cov协方差函数参数同cor函数。

（1）用法

cor(x,y=NULL,use="everything",method= c("pearson","kendall","spearman"))
cor(x, use='everything', method='pearson')  # x is a matrix/data frame: returns the correlation matrix of its columns
cor(mtcars$mpg, mtcars$cyl)  # two numeric vectors: returns the single correlation coefficient between them

x：数值向量、矩阵或数据框。y：可选，与x维度相容的向量、矩阵或数据框；默认NULL时等价于y=x，即计算x各列之间的相关系数。
use：指定缺失数据的处理方式。可选项：all.obs（假设不存在缺失数据，遇到缺失值时会报错）、everything（数据存在缺失值时，相应的相关系数计算结果为NA）、complete.obs（行删除）、pairwise.complete.obs（成对删除）。
method：指定相关系数的类型。可选类型为pearson、spearman、kendall。

（2）R Script

> testdata1[1:5,1:5] #随意找的数据
              CF1      CF2      CF3      CF4       CM1
rna13468 66.97984 80.07318 54.87525 91.65463  1.401584
rna885   26.53467 44.51450 33.65076 40.72113 60.633389
rna32332  0.00000 37.71825 11.89813  0.00000  1.403419
rna8744  42.44415 52.31791 60.35968 54.00533 39.524090
rna16488  0.00000  0.00000  0.00000  0.00000  0.000000
> cor_data <- cor(testdata1,method="pearson")
> round(cor_data[1:5,1:5],3)
       CF1    CF2    CF3    CF4    CM1
CF1  1.000  0.668  0.697  0.952 -0.129
CF2  0.668  1.000  0.923  0.664 -0.534
CF3  0.697  0.923  1.000  0.647 -0.386
CF4  0.952  0.664  0.647  1.000 -0.203
CM1 -0.129 -0.534 -0.386 -0.203  1.000
> cor_data <- cor(testdata1,method="spearman")
> round(cor_data[1:5,1:5],3)
       CF1    CF2    CF3    CF4    CM1
CF1  1.000  0.728  0.706  0.904 -0.055
CF2  0.728  1.000  0.785  0.734 -0.373
CF3  0.706  0.785  1.000  0.605 -0.212
CF4  0.904  0.734  0.605  1.000 -0.182
CM1 -0.055 -0.373 -0.212 -0.182  1.000

corrplot

（1）用法

corrplot(corr,  #相关性系数矩阵 
    method = c("circle", "square", "ellipse", "number", "shade", "color", "pie"), 
#可视化的方法，可以是圆形、方形、椭圆形、数值、阴影、颜色或饼图形 
    type = c("full", "lower", "upper"), 
#指定展示的方式，可以是完全的、下三角或上三角 
    add = FALSE, 
    col = NULL,  #指定图形展示的颜色，默认以均匀的颜色展示 
    bg = "white",  #背景色 
    title = "",  #标题 
    is.corr = TRUE,  #是否为相关系数绘图 
    diag = TRUE,  #是否展示对角线上的结果 
    outline = FALSE,  #是否绘制圆形、方形或椭圆形的轮廓 
    mar = c(0,0,0,0),  #设置图形的四边间距 
    addgrid.col = NULL, 
#当选择的方法为颜色或阴影时，默认的网格线颜色为白色，否则为灰色 
    addCoef.col = NULL, 
#在图上添加相关系数并指定其颜色，默认NULL表示不添加相关系数；该参数可与circle、color等各种method搭配使用，并不限于number 
    addCoefasPercent = FALSE,  #是否将相关系数转换为百分比格式 
    order = c("original", "AOE", "FPC", "hclust", "alphabet"), 
#指定相关系数排序的方法，可以是原始顺序original、特征向量角序AOE、第一主成分顺序FPC、
#层次聚类顺序hclust和字母顺序，一般AOE排序结果都比FPC要好 
    hclust.method = c("complete", "ward", "single", "average", 
                      "mcquitty", "median", "centroid"), 
#当order为hclust时，该参数可以是层次聚类中的7种之一 
    addrect = NULL,  #当order为hclust时，可以为添加相关系数图添加矩形框 
    rect.col = "black",  #指定矩形框的颜色 
    rect.lwd = 2,  #指定矩形框的线宽
    tl.pos = NULL, 
#指定文本标签(变量名称)的位置，当type=full时，默认标签位置在左边和顶部(lt)，
#当type=lower时，默认标签在左边和对角线(ld)，当type=upper时，默认标签在顶部和对角线，
#d表示对角线，n表示不添加文本标签 
    tl.cex = 1,  #指定文本标签的大小 
    tl.col = "red",  #指定文本标签的颜色 
    tl.offset = 0.4, tl.srt = 90, 
    cl.pos = NULL, 
#图例（颜色）位置，当type=upper或full时，图例在右侧，当type=lower时，图例在底部，
#不需要图例时，只需指定该参数为n 
    cl.lim = NULL, 
    cl.length = NULL, cl.cex = 0.8, cl.ratio = 0.15, 
    cl.align.text = "c",cl.offset = 0.5, 
    addshade = c("negative", "positive", "all"), 
#只有当method=shade时，该参数才有用，参数值可以是negative/positive和all，分别表示对负相关系数、
#正相关系数和所有相关系数添加阴影。注意：正相关系数的阴影是45度，负相关系数的阴影是135度
    shade.lwd = 1,  #指定阴影的线宽 
    shade.col = "white",  #指定阴影线的颜色 
    p.mat = NULL, sig.level = 0.05, 
    insig = c("pch","p-value","blank", "n"), 
    pch = 4, pch.col = "black", pch.cex = 3, 
    plotCI = c("n","square", "circle", "rect"), 
    lowCI.mat = NULL, uppCI.mat = NULL, ...)

（2）R Script

library(corrplot)
# Plot the correlation matrix with every corrplot() option left at its default
corrplot(corr = cor_data)

image

# Glyph drawn in each cell; one of:
# "circle", "square", "ellipse", "number", "shade", "color", "pie"
corrplot(
  cor_data,
  method = "pie",
  title = "method=pie"
)

image

# Which part of the matrix to display; one of:
# "full", "lower", "upper"
corrplot(
  cor_data,
  type = "upper",
  title = "type=upper"
)

image

# Mixed layout: one glyph style below the diagonal and another above it,
# e.g. corrplot.mixed(matrix, lower = "number", upper = "circle")
# tl.col     - colour of the variable labels on the diagonal
# lower.col  - colour(s) used for the lower triangle
# number.cex - text size of the numbers in the lower triangle
corrplot.mixed(cor_data, lower = "ellipse", upper = "pie")
corrplot.mixed(
  cor_data,
  lower = "number",
  upper = "pie",
  tl.col = "green",
  lower.col = "skyblue",
  number.cex = 1
)

image

# Ordering of the variables; one of:
# "original", "AOE", "FPC", "hclust", "alphabet"
# Extra options that apply when order = "hclust":
#   addrect = 4         draw rectangles around 4 cluster groups
#   rect.col = "black"  colour of the rectangles
#   rect.lwd = 2        line width of the rectangles
#   hclust.method       one of "complete", "ward", "single", "average",
#                       "mcquitty", "median", "centroid"
corrplot(
  cor_data,
  order = "hclust",
  hclust.method = "average",
  addrect = 4
)
corrplot(cor_data, order = "AOE")

image

# Colours: a blue-white-red palette for the cells, plus explicit background,
# grid-line and label colours
col1 <- colorRampPalette(c("blue", "white", "red"))
corrplot(
  cor_data,
  order = "hclust", addrect = 4,
  col = col1(100),
  bg = "khaki1", addgrid.col = "green",
  tl.col = "purple", tl.cex = 0.7
)

image

# Print the coefficients on top of the coloured cells.
# addCoef.col sets the text colour of the coefficients; addCoefasPercent
# shows them as percentages. TRUE is spelled out because T can be reassigned.
# NOTE: col1 is the palette function created in the colour example.
corrplot(cor_data, method = "color", order = "hclust", addrect = 4,
         col = col1(100),
         tl.col = "black", addCoef.col = "grey", addCoefasPercent = TRUE)

image

ggcorrplot

ggcorrplot包内只有2个函数，一个cor_pmat()用于计算p值，一个ggcorrplot()用于绘图。ggcorrplot相当于精简版的corrplot包，只有主题更加丰富多样。

（1）用法

ggcorrplot(corr, method = c("square", "circle"), type = c("full", "lower", "upper"), 
  ggtheme = ggplot2::theme_minimal, title = "",
  show.legend = TRUE, legend.title = "Corr", 
  show.diag = FALSE, 
  colors = c("blue", "white", "red"), outline.color = "gray",
  hc.order = FALSE, hc.method = "complete", 
  lab = FALSE, lab_col = "black", lab_size = 4, p.mat = NULL, sig.level = 0.05,
  insig = c("pch", "blank"), pch = 4, pch.col = "black", pch.cex = 5,
  tl.cex = 12, tl.col = "black", tl.srt = 45, digits = 2 )

（2）R Script

library(ggcorrplot)
# Compute the matrix of correlation p-values.
# cor_pmat() must be given the raw data, not the correlation matrix: it runs
# cor.test() on every pair of columns. Extra arguments are forwarded to
# cor.test(), so use the same method that produced cor_data (Spearman here).
cor_p <- cor_pmat(testdata1, method = "spearman")
round(cor_p[1:5, 1:5], 3)
# Default plot: square tiles
ggcorrplot(cor_data)
# Circle glyphs instead of squares
ggcorrplot(cor_data, method = "circle")
# Reorder the variables by hierarchical clustering
ggcorrplot(cor_data, hc.order = TRUE, outline.color = "white")

image

# Display mode: keep only the lower triangle
ggcorrplot(
  cor_data,
  hc.order = TRUE,
  outline.color = "white",
  type = "lower"
)
# Custom colours for low / mid / high correlations
ggcorrplot(
  cor_data,
  hc.order = TRUE,
  outline.color = "white",
  type = "lower",
  colors = c("#6D9EC1", "white", "#E46726")
)
# Swap the ggplot2 theme
ggcorrplot(
  cor_data,
  hc.order = TRUE,
  outline.color = "white",
  type = "lower",
  colors = c("#6D9EC1", "white", "#E46726"),
  ggtheme = ggplot2::theme_void()
)

image

# Print the correlation coefficients inside the tiles
ggcorrplot(
  cor_data,
  hc.order = TRUE,
  outline.color = "white",
  type = "lower",
  colors = c("#6D9EC1", "white", "#E46726"),
  lab = TRUE
)
# Mark non-significant correlations with a cross, based on the p-value matrix
ggcorrplot(
  cor_data,
  hc.order = TRUE,
  outline.color = "white",
  type = "lower",
  colors = c("#6D9EC1", "white", "#E46726"),
  p.mat = cor_p
)

image

ggcorr

（1）R Script

# ggcorr() is provided by GGally; guides(), theme() and element_text() come
# from ggplot2. Load both explicitly so this section runs on its own.
library(GGally)
library(ggplot2)
# Compute and plot the correlations directly from the raw data.
# method = c(use, method): pairwise-complete observations, Spearman.
ggcorr(testdata1, method = c("pairwise", "spearman"))
# Number of breaks in the colour scale
ggcorr(testdata1, method = c("pairwise", "spearman"),
       nbreaks = 5)
# Legend: title, position and text size, then a wider colour bar and a
# larger legend title
ggcorr(testdata1, method = c("pairwise", "spearman"),
       name = "12345", legend.position = "bottom", legend.size = 12) +
  guides(fill = guide_colorbar(barwidth = 18, title.vjust = 0.75)) +
  theme(legend.title = element_text(size = 14))

image

# Custom colours for low / mid / high correlations
ggcorr(
  testdata1,
  method = c("pairwise", "spearman"),
  low = "steelblue", mid = "white", high = "darkred"
)
# Draw circles instead of tiles, sized between min_size and max_size
ggcorr(
  testdata1,
  method = c("pairwise", "spearman"),
  geom = "circle", min_size = 2, max_size = 6
)
# Print the correlation coefficients on the plot
ggcorr(
  testdata1,
  method = c("pairwise", "spearman"),
  label = TRUE, label_size = 3, label_color = "white"
)

image

# Control the variable labels: justification, size, colour, and extra
# horizontal room (layout.exp) so the labels are not clipped
ggcorr(testdata1, method = c("pairwise", "spearman"),
       hjust = 0.75, size = 5, color = "grey50", layout.exp = 1)
# Highlight only the stronger correlations: start from an empty layer, then
# add points coloured by sign; points with |coefficient| <= 0.5 are fully
# transparent (alpha 0).
# guides(... = "none") hides both legends; passing FALSE is deprecated in
# ggplot2.
ggcorr(testdata1, method = c("pairwise", "spearman"),
       label = TRUE, hjust = 0.75, geom = "blank") +
  geom_point(size = 10, aes(color = coefficient > 0,
                            alpha = abs(coefficient) > 0.5)) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  guides(color = "none", alpha = "none")

image

样品间相似性（similarity）和距离（distance）

（1）表示距离的方法

欧式距离（Euclidean Distance）
√((a1-a2)^2+(b1-b2)^2+(c1-c2)^2)
dist(t(x),p=2)
曼哈顿距离（Manhattan Distance）
|a1-a2|+|b1-b2|+|c1-c2|
dist(t(x),"manhattan")
切比雪夫距离（Chebyshev Distance）
max(|a1-a2|,|b1-b2|,|c1-c2|)
dist(t(x),"maximum")
闵可夫斯基距离（Minkowski Distance）
dist(t(x),"minkowski")
标准化欧氏距离（Standardized Euclidean distance）
先将数据各维分量标准化为均值0、方差1，即(x-μ)/σ：标准化后的值=(标准化前的值-分量的均值)/分量的标准差。
# Standardize each feature (each column of t(x)) to mean 0 and sd 1, then
# take Euclidean distances between the samples (rows of t(x))
x1 <- scale(t(x), center = TRUE, scale = TRUE)
dist(x1)
马氏距离（Mahalanobis Distance）
兰氏距离（Canberra Distance）
dist(t(x), method = "canberra")
夹角余弦（Cosine）
汉明距离（Hamming distance）
两个等长字符串s1与s2之间的汉明距离定义为将其中一个变为另外一个所需要作的最小替换次数。
# hamming.distance() is not in base R; it is provided by the e1071 package
library(e1071)
x <- c(1, 0, 0)
y <- c(1, 0, 1)
hamming.distance(x, y) # 1: the vectors differ in exactly one position
杰卡德相似系数（Jaccard similarity coefficient）
# stats::dist() has no "Jaccard" method and fails on that name. Its "binary"
# method is the Jaccard distance (1 - Jaccard similarity): non-zero entries
# count as present, zero entries as absent.
dist(t(x), method = "binary")
相关系数（Correlation coefficient）与相关距离（Correlation distance）
1-cor(x)
信息熵（Information Entropy）
信息熵是衡量分布的混乱程度或分散程度的一种度量。分布越分散（分布越平均），信息熵就越大。分布越有序（分布越集中），信息熵就越小。
KL散度（Kullback-Leibler divergence）

（2）dist用法

This function computes and returns the distance matrix computed by using the specified distance measure to compute the distances between the rows of a data matrix.
这个函数用特定的方法计算矩阵的行之间的距离，并返回距离矩阵。

dist(x, method = "euclidean", diag = FALSE, upper = FALSE, p = 2)

method：可以是"euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"
diag：是否显示对角线的值
upper：是否显示上三角的值
p：The power of the Minkowski distance

（3）scale用法

scale(x, center = TRUE, scale = TRUE)

scale是对矩阵的每一列进行标准化，如果要对行标准化需要先转置。如heatmapdata <- t(scale(t(heatmapdata)))

（4）R Script

# pheatmap() and brewer.pal() are not in base R; load their packages
library(pheatmap)
library(RColorBrewer)

# Euclidean distances between samples (the columns of testdata1)
sampleDist <- dist(t(testdata1))
sampleDistMatrix <- as.matrix(sampleDist)
# Remove the column names of the distance matrix (row names are kept)
colnames(sampleDistMatrix) <- NULL
# Reversed "Blues" palette: the smallest distances get the darkest blue
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
# Cluster rows and columns with the same precomputed distances
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDist,
         clustering_distance_cols = sampleDist,
         color = colors)

image

R相关性分析和相关性热图 - 简书 (jianshu.com)

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 204,684评论 6赞 478
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 87,143评论 2赞 381
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 151,214评论 0赞 337
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 54,788评论 1赞 277
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 63,796评论 5赞 368
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 48,665评论 1赞 281
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 38,027评论 3赞 399
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 36,679评论 0赞 258
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 41,346评论 1赞 299
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 35,664评论 2赞 321
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 37,766评论 1赞 331
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 33,412评论 4赞 321
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 39,015评论 3赞 307
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 29,974评论 0赞 19
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 31,203评论 1赞 260
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 45,073评论 2赞 350
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 42,501评论 2赞 343

R语言进行相关性分析

R语言进行相关性分析

相关性分析

相关性指标

（1）Pearson相关系数（皮尔逊积差相关系数）

（2）Spearman等级相关系数（斯皮尔曼秩相关系数）

（3）Kendall's Tau相关系数

（4）其它

cor函数

（1）用法

（2）R Script

corrplot

（1）用法

（2）R Script

ggcorrplot

（1）用法

（2）R Script

ggcorr

（1）R Script

样品间相似性（similarity）和距离（distance）

（1）表示距离的方法

（2）dist用法

（3）scale用法

（4）R Script

推荐阅读更多精彩内容