本文章数据截取自一家银行的真实客户与交易数据;涉及客户主记录、帐号、交易、业务和信用卡数据;下载地址:https://pan.baidu.com/s/1mjg8OGS
Note:
本文同时用了dplyr包(处理速度很快)和sqldf包(sql语句写起来比较方便,但是速度没有dplyr快)的方法进行数据整理
导入数据
library(tidyverse)
library(sqldf)
accounts <- read_csv("accounts.csv")
clients <- read_csv("clients.csv")
disp <- read_csv("disp.csv")
loans <- read_csv("loans.csv")
trans <- read_csv("trans.csv")
……………………………………………………
使用“loans”数据,根据“status”变量生成违约标识变量(bad_good),
其中当等于status=A时,#取值为bad_good=0,
其中当等于status in (B,D)时,取值为bad_good=1,
等于status=C时,bad_good为缺失值。*/
A代表合同终止,没问题;B代表合同终止,贷款没有支付;
C代表合同处于执行期,至今正常;D代表合同处于执行期,
欠债状态。
loans <- mutate(loans,bad_good=ifelse(status %in% c("B","D"),1,ifelse(status =="A",0,NA_integer_)))
********************************************************
是否违约与借款人的年龄是否有关系?
借款人的年龄信息在clients表中,所以要将两个表连接起来
#dplyr方法
dpl_data <- left_join(loans,disp,by=c("account_id")
) %>% left_join(y=clients,by=c("client_id")
) %>% filter(type=="所有者") %>% select(-one_of(c("disp_id","type")))
#SQL方法
sql_data <- sqldf("select a.*,c.sex,c.birth_date,c.district_id,
c.client_id from loans as a
left join disp as b on a.account_id=b.account_id
left join clients as c on b.client_id=c.client_id
where b.type='所有者'")
setequal(dpl_data,sql_data)
dpl_data <- mutate(dpl_data,age=floor((as.Date(dpl_data$date)-as.Date(dpl_data$birth_date))/365))
aggregate(dpl_data$age,by=list(dpl_data$bad_good),mean)
Group.1 x
1 0 36.60591
2 1 37.55263
从图中可以简单看出年龄在不同好坏状态下的分布相差不大,相关性应该不大,不过还要进行假设检验才能得出结论
ggplot(dpl_data,mapping = aes(x=as.factor(bad_good),y=age))+geom_boxplot()
*****************************************************************
是否违约与借款人的资产否有关系?
因为账户余额在“trans”表中,所以要将loans表和trans表连接
dpl <- left_join(loans,trans,by="account_id",suffix = c("_x", "_y")) %>% arrange(account_id,date_y) %>% select(
-one_of(c("trans_id","type","operation","amount_y","k_symbol","bank","account" ))
) %>% rename(date=date_x,amount=amount_x)
data2<-sqldf("select a.*,b.balance,b.date as t_date
from loans as a
left join trans as b on a.account_id=b.account_id
order by a.account_id,t_date")
dpl$date2 <- as.Date(dpl$date)
dpl$date2_y <- as.Date(dpl$date_y)
dpl$balance2 <- as.numeric(substr(gsub(",","",dpl$balance),2,nchar(dpl$balance)))
dpl <- select(dpl,-one_of("date","date_y","balance"))
只需要贷款前一年内的账户余额
dpl2 <- filter(dpl,date2>date2_y & date2<=date2_y+365)
dpl4<-select(dpl2,account_id,status,amount,balance2) %>%
group_by(account_id,amount,status) %>%
summarize(avg_balance=mean(balance2,na.rm = T),stdev_balance2=sd(balance2,na.rm = T))%>%
arrange(account_id)
data4<-sqldf("select a.account_id,a.status,a.amount,
avg(balance2) as avg_balance,
stdev(balance2) as stdev_balance2
from dpl2 as a
group by a.account_id
order by a.account_id")
dpl4$bad_good<-ifelse(dpl4$status=="B"
| dpl4$status=="D",1,
ifelse(dpl4$status=="A",0,NA))
dpl4$bad_good <- as.factor(dpl4$bad_good)
资产高低和违约的可能性是否有关系?
从图可以看出资产低的客人违约的可能性较高
with(dpl4,{
aggregate(avg_balance,by=list(bad_good),mean)
})
Group.1 x
1 0 42499.40
2 1 34642.31
ggplot(dpl4,mapping = aes(x=bad_good,y=avg_balance))+geom_boxplot()
贷款数额高的客人违约的可能性较高
with(dpl4,{
tapply(amount,bad_good,mean)
})
0 1
91641.46 205002.00
ggplot(dpl4,mapping = aes(x=bad_good,y=amount))+geom_boxplot()
贷款额度超出资产越高的客人违约的可能性较高
dpl4 <- mutate(dpl4,rate=amount/avg_balance)
with(dpl4,{tapply(rate,bad_good,mean)})
ggplot(dpl4,mapping = aes(x=bad_good,y=rate))+geom_boxplot()
资产波动越高的客人违约的可能性较高,不过不明显
with(dpl4,{tapply(stdev_balance2,bad_good,mean)})
ggplot(dpl4,mapping = aes(x=bad_good,y=stdev_balance2))+geom_boxplot()