# Open-ended bin edges used when building manual cut lists.
pinf = float('inf')  # positive infinity
ninf = -pinf  # negative infinity
def mono_bin(Y, X, n=20):
    """Quantile-bin ``X`` so that the per-bucket bad rate is monotonic.

    Starting from ``n`` quantile buckets, the bucket count is lowered by one
    per iteration until the Spearman rank correlation between the per-bucket
    mean of ``X`` and the per-bucket mean of ``Y`` is +/-1 (or is NaN).
    WOE and IV are then computed on the final buckets and the summary table
    is printed to stdout.

    Args:
        Y: pandas Series of labels; 1 marks a bad customer (the code sums it
            to count bads), anything else counted is treated as good.
        X: pandas Series with the numeric feature to bin.
        n: Number of quantile buckets to try first.

    Returns:
        Tuple ``(d4, iv, cut, woe)``:
            d4: per-bucket summary DataFrame sorted by the bucket minimum,
                with columns min, max, sum, total, bad_rate, group_rate,
                woe, iv and iv_sum.
            iv: total information value (sum of the per-bucket ``iv``).
            cut: list of cut points, ``-inf`` and ``inf`` at the ends and the
                quantiles of ``X`` (rounded to 4 decimals) in between.
            woe: list of per-bucket WOE values rounded to 3 decimals, in the
                row order of ``d4``.
    """
    bad = Y.sum()  # 1 marks a bad customer
    good = Y.count() - bad

    r = 0
    # A perfect rank correlation can come back as 0.9999999999999999 because
    # of floating-point round-off, so compare against 1 with a small
    # tolerance instead of exactly. NaN fails the comparison and ends the
    # loop, exactly as it did with the strict comparison.
    while np.abs(r) < 1 - 1e-9:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        # observed=False keeps empty buckets so the rows stay aligned with
        # the cut points even if the pandas default changes.
        d2 = d1.groupby('Bucket', as_index=True, observed=False)
        r, _ = stats.spearmanr(d2['X'].mean(), d2['Y'].mean())
        n = n - 1
    # From here on, n is one less than the number of buckets actually used.

    d3 = pd.DataFrame({
        'min': d2['X'].min(),
        'max': d2['X'].max(),
        'sum': d2['Y'].sum(),
        'total': d2['Y'].count(),
        'bad_rate': d2['Y'].mean(),
    })
    d3['group_rate'] = d3['total'] / (bad + good)
    # WOE in the good/bad orientation: ln((good_i / bad_i) / (good / bad)).
    d3['woe'] = np.log((1 - d3['bad_rate']) / d3['bad_rate'] / (good / bad))
    d3['iv'] = (
        ((d3['total'] - d3['sum']) / good) - d3['sum'] / bad
    ) * d3['woe']
    iv = d3['iv'].sum()
    d3['iv_sum'] = iv

    # sort_index(by=...) no longer exists in pandas; sort_values is the
    # supported way to order rows by a column.
    d4 = d3.sort_values(by='min')
    print("=" * 90)
    print(d4)

    # Rebuild the interior cut points from the quantiles of X for the final
    # bucket count (n + 1), with open-ended outer edges.
    cut = [float('-inf')]
    cut.extend(round(X.quantile(i / (n + 1)), 4) for i in range(1, n + 1))
    cut.append(float('inf'))

    woe = list(d4['woe'].round(3))
    return d4, iv, cut, woe
# Custom (manual) binning function
def self_bin(Y, X, cat):
    """Bin ``X`` with caller-supplied cut points and compute WOE / IV.

    The summary table is printed to stdout.

    Args:
        Y: pandas Series of labels; 1 marks a bad customer (the code sums it
            to count bads).
        X: pandas Series with the numeric feature to bin.
        cat: Bin edges passed straight to ``pd.cut``.

    Returns:
        Tuple ``(d4, iv, woe)``:
            d4: per-bucket summary DataFrame sorted by the bucket minimum,
                with columns min, max, sum, total, bad_rate, group_rate,
                woe, iv and iv_sum.
            iv: total information value (sum of the per-bucket ``iv``).
            woe: list of per-bucket WOE values rounded to 3 decimals, in the
                row order of ``d4``.
    """
    bad = Y.sum()
    good = Y.count() - bad

    d1 = pd.DataFrame({'X': X, 'Y': Y, 'Bucket': pd.cut(X, cat)})
    # observed=False keeps empty buckets so the rows stay aligned with the
    # supplied cut points even if the pandas default changes.
    d2 = d1.groupby('Bucket', as_index=True, observed=False)

    d3 = pd.DataFrame({
        'min': d2['X'].min(),
        'max': d2['X'].max(),
        'sum': d2['Y'].sum(),
        'total': d2['Y'].count(),
        'bad_rate': d2['Y'].mean(),
    })
    d3['group_rate'] = d3['total'] / (bad + good)
    # WOE in the good/bad orientation: ln((good_i / bad_i) / (good / bad)).
    d3['woe'] = np.log((1 - d3['bad_rate']) / d3['bad_rate'] / (good / bad))
    d3['iv'] = (
        ((d3['total'] - d3['sum']) / good) - d3['sum'] / bad
    ) * d3['woe']
    iv = d3['iv'].sum()
    d3['iv_sum'] = iv

    # sort_index(by=...) no longer exists in pandas; sort_values is the
    # supported way to order rows by a column.
    d4 = d3.sort_values(by='min')
    print("=" * 90)
    print(d4)

    woe = list(d4['woe'].round(3))
    return d4, iv, woe
def cate_woe(Y, X):
    """Compute WOE / IV for a categorical feature, one bucket per category.

    Categories that contain no bad or no good observations are merged into a
    neighbouring category before WOE is computed. The summary table is
    printed to stdout.

    Args:
        Y: pandas Series of labels; 1 marks a bad customer (the code sums it
            to count bads).
        X: pandas Series with the categorical feature.

    Returns:
        Tuple ``(d4, iv, woe)``:
            d4: per-category summary DataFrame sorted by WOE, with columns
                sum, total, bad_rate, group_rate, woe, iv and iv_sum.
            iv: total information value (sum of the per-category ``iv``).
            woe: list of WOE values rounded to 3 decimals, in the row order
                of ``d4``.
    """
    bad = Y.sum()
    good = Y.count() - bad

    d1 = pd.DataFrame({"X": X, "Y": Y})
    d2 = d1.groupby('X', as_index=True)
    d3 = pd.DataFrame()
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y

    # Walk the rows from last to first so that dropping a row never shifts
    # the position of a row that has not been examined yet.
    for pos in reversed(range(d3.shape[0])):
        n_bad = d3.iloc[pos, 0]
        n_total = d3.iloc[pos, 1]
        if n_bad != 0 and n_total - n_bad != 0:
            continue
        if d3.shape[0] == 1:
            # Nothing left to merge into; keep the only row.
            break
        # Merge into the previous row. For the first row, pos - 1 would wrap
        # around to the last row, so merge into the next row instead.
        target = pos - 1 if pos > 0 else 1
        d3.iloc[target, 0] = d3.iloc[target, 0] + n_bad
        d3.iloc[target, 1] = d3.iloc[target, 1] + n_total
        d3.drop(d3.index[pos], inplace=True)

    d3['bad_rate'] = d3['sum'] / d3['total']
    d3['group_rate'] = d3['total'] / (bad + good)
    # WOE in the good/bad orientation: ln((good_i / bad_i) / (good / bad)).
    d3['woe'] = np.log((1 - d3['bad_rate']) / d3['bad_rate'] / (good / bad))
    d3['iv'] = (
        ((d3['total'] - d3['sum']) / good) - d3['sum'] / bad
    ) * d3['woe']
    iv = d3['iv'].sum()
    d3['iv_sum'] = iv

    # sort_index(by=...) no longer exists in pandas; sort_values is the
    # supported way to order rows by a column.
    d4 = d3.sort_values(by='woe')
    woe = list(d4['woe'].round(3))
    print("=" * 90)
    print(d4)
    return d4, iv, woe
# Optimal (monotonic) binning.
dfx1, ivx1, cutx1, woex1 = mono_bin(df['因变量'], df['变量'], n=10)
# Manual binning with user-defined cut points.
cutx2 = [ninf, 1, pinf]
dfx2, ivx2, woex2 = self_bin(df['因变量'], df['变量'], cutx2)
# Check monotonicity. self_bin returns the WOE values as a plain list, which
# has no .plot accessor, so wrap it in a Series before plotting.
pd.Series(woex2).plot.bar(color='b', alpha=0.3, rot=0)
也可以参考这两篇文章: