#第 2章 代码开始
> ## 利用 nnet 包中的函数multinom ,建立多元logistic回归模型:
> library(foreign)
> library(nnet)
> library(ggplot2)
> library(reshape2)
> ml <- read.dta("hsbdemo.dta")#读取.dta格式数据
> #View(ml)#查看数据的具体形式
> with(ml, table(ses, prog))#ses行和prog列的表格
prog
ses general academic vocation
low 16 19 12
middle 20 44 31
high 9 42 7
> #with在数据框上的操作
> #do.call(操作,数据)
> with(ml, do.call(rbind, tapply(write, prog, function(x) c(M = mean(x), SD = sd(x)))))
M SD
general 51.33333 9.397775
academic 56.25714 7.943343
vocation 46.76000 9.318754
> ml$prog2 <- relevel(ml$prog, ref = "academic")#以academic作为参考
> test <- multinom(prog2 ~ ses + write, data = ml)#回归
# weights: 15 (8 variable)
initial value 219.722458
iter 10 value 179.982880
final value 179.981726
converged
> summary(test)
Call:
multinom(formula = prog2 ~ ses + write, data = ml)
Coefficients:
(Intercept) sesmiddle seshigh write
general 2.852198 -0.5332810 -1.1628226 -0.0579287
vocation 5.218260 0.2913859 -0.9826649 -0.1136037
#行general 和vocation是指相对于academic的值,列sesmiddle seshigh是指相对于seslow的值
Std. Errors:
(Intercept) sesmiddle seshigh write
general 1.166441 0.4437323 0.5142196 0.02141097
vocation 1.163552 0.4763739 0.5955665 0.02221996
Residual Deviance: 359.9635
AIC: 375.9635
> # 2-tailed z test
> z <- summary(test)$coefficients/summary(test)$standard.errors #Z值的计算
> z
(Intercept) sesmiddle seshigh write
general 2.445214 -1.2018081 -2.261334 -2.705562
vocation 4.484769 0.6116747 -1.649967 -5.112689
> p <- (1 - pnorm(abs(z), 0, 1)) * 2##P值的计算
> p
(Intercept) sesmiddle seshigh write
general 0.0144766100 0.2294379 0.02373856 6.818902e-03
vocation 0.0000072993 0.5407530 0.09894976 3.176045e-07
#general 相对于academic,因素seshigh和write是有意义的
#vocation 相对于academic,因素write是有意义的
> # extract the coefficients from the model and exponentiate
> exp(coef(test)) #OR值
(Intercept) sesmiddle seshigh write
general 17.32582 0.5866769 0.3126026 0.9437172
vocation 184.61262 1.3382809 0.3743123 0.8926116
#解释OR值,write每提高一个单位,产生general是产生academic的0.9437172倍
#write每提高一个单位,产生vocation是产生academic的0.8926116倍
#
> head(pp <- fitted(test))
academic general vocation
1 0.1482764 0.3382454 0.5134781
2 0.1202017 0.1806283 0.6991700
3 0.4186747 0.2368082 0.3445171
4 0.1726885 0.3508384 0.4764731
5 0.1001231 0.1689374 0.7309395
6 0.3533566 0.2377976 0.4088458
> dses <- data.frame(ses = c("low", "middle", "high"), write = mean(ml$write))
#这里把写作write取均值,即,每个人的写作贡献固定了,只有社会经济地位的变化了
> predict(test, newdata = dses, "probs")#预测
academic general vocation
1 0.4396845 0.3581917 0.2021238
2 0.4777488 0.2283353 0.2939159
3 0.7009007 0.1784939 0.1206054
> dwrite <- data.frame(ses = rep(c("low", "middle", "high"), each = 41), write = rep(c(30:70),3))
> # store the predicted probabilities for each value of ses and write
> pp.write <- cbind(dwrite, predict(test, newdata = dwrite, type = "probs", se = TRUE))
> # calculate the mean probabilities within each level of ses
> by(pp.write[, 3:5], pp.write$ses, colMeans)
pp.write$ses: high
academic general vocation
0.6164315 0.1808037 0.2027648
--------------------------------------------------------
pp.write$ses: low
academic general vocation
0.3972977 0.3278174 0.2748849
--------------------------------------------------------
pp.write$ses: middle
academic general vocation
0.4256198 0.2010864 0.3732938
> #melt data set to long for ggplot2
> lpp <- melt(pp.write, id.vars = c("ses", "write"), value.name = "probability")
> head(lpp) # view first few rows
ses write variable probability
1 low 30 academic 0.09843588
2 low 31 academic 0.10716868
3 low 32 academic 0.11650390
4 low 33 academic 0.12645834
5 low 34 academic 0.13704576
6 low 35 academic 0.14827643
> # plot predicted probabilities across write values for each level of ses
> # facetted by program type
> ggplot(lpp, aes(x = write, y = probability, colour = ses)) + geom_line() + facet_grid(variable ~., scales = "free")
> install.packages("mlogit")
> library(Formula)
> library(maxLik)
> library(miscTools)
> library(mlogit)
> data("Fishing", package = "mlogit")
> Fish <- mlogit.data(Fishing,shape = "wide",choice = "mode")
> summary(mlogit(mode ~ 0|income, data = Fish))
Call:
mlogit(formula = mode ~ 0 | income, data = Fish, method = "nr")
Frequencies of alternatives:choice
beach boat charter pier
0.11337 0.35364 0.38240 0.15059
nr method
4 iterations, 0h:0m:0s
g'(-H)^-1g = 8.32E-07
gradient close to zero
Coefficients :
Estimate Std. Error z-value Pr(>|z|)
(Intercept):boat 7.3892e-01 1.9673e-01 3.7560 0.0001727 ***
(Intercept):charter 1.3413e+00 1.9452e-01 6.8955 5.367e-12 ***
(Intercept):pier 8.1415e-01 2.2863e-01 3.5610 0.0003695 ***
income:boat 9.1906e-05 4.0664e-05 2.2602 0.0238116 *
income:charter -3.1640e-05 4.1846e-05 -0.7561 0.4495908
income:pier -1.4340e-04 5.3288e-05 -2.6911 0.0071223 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Log-Likelihood: -1477.2
McFadden R^2: 0.013736
Likelihood ratio test : chisq = 41.145 (p.value = 6.0931e-09)
2.2章 有序分类logistic回归 即 等级资料
有可能把等级资料转换成二分类资料就转换,否则只能当作等级资料或多分类资料;
是否从学校毕业:不可能,有可能,极有可能;父母是否有学位:1表示有,0表示无;公立还是私立学校:1公里,0私立
## 程序包MASS提供polr()函数可以进行ordered logit或probit回归
require(foreign)
require(ggplot2)
require(MASS)
require(Hmisc)
require(reshape2)
dat <- read.dta("ologit.dta")
#View(dat)
head(dat)
## one at a time, table apply, pared, and public
lapply(dat[, c("apply", "pared", "public")], table)
## three way cross tabs (xtabs) and flatten the table
ftable(xtabs(~ public + apply + pared, data = dat))
summary(dat$gpa)
sd(dat$gpa)
ggplot(dat, aes(x = apply, y = gpa)) +
geom_boxplot(size = .75) +
geom_jitter(alpha = .5) +
facet_grid(pared ~ public, margins = TRUE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
## fit ordered logit model and store results 'm'
m <- polr(apply ~ pared + public + gpa, data = dat, Hess=TRUE)
## view a summary of the model
summary(m)
## store table
(ctable <- coef(summary(m)))
## calculate and store p values
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
## combined table
(ctable <- cbind(ctable, "p value" = p))
(ci <- confint(m)) # default method gives profiled CIs
confint.default(m) # CIs assuming normality
## odds ratios
exp(coef(m))
## OR and CI
exp(cbind(OR = coef(m), ci))
sf <- function(y) {
c('Y>=1' = qlogis(mean(y >= 1)),
'Y>=2' = qlogis(mean(y >= 2)),
'Y>=3' = qlogis(mean(y >= 3)))
}
(s <- with(dat, summary(as.numeric(apply) ~ pared + public + gpa, fun=sf)))
glm(I(as.numeric(apply) >= 2) ~ pared, family="binomial", data = dat)
glm(I(as.numeric(apply) >= 3) ~ pared, family="binomial", data = dat)
s[, 4] <- s[, 4] - s[, 3]
s[, 3] <- s[, 3] - s[, 3]
s # print
plot(s, which=1:3, pch=1:3, xlab='logit', main=' ', xlim=range(s[,3:4]))
newdat <- data.frame(
pared = rep(0:1, 200),
public = rep(0:1, each = 200),
gpa = rep(seq(from = 1.9, to = 4, length.out = 100), 4))
newdat <- cbind(newdat, predict(m, newdat, type = "probs"))
##show first few rows
head(newdat)
lnewdat <- melt(newdat, id.vars = c("pared", "public", "gpa"),
variable.name = "Level", value.name="Probability")
## view first few rows
head(lnewdat)
ggplot(lnewdat, aes(x = gpa, y = Probability, colour = Level)) +
geom_line() + facet_grid(pared ~ public, labeller="label_both")