通过等宽分箱的方法对df连续型数值进行离散化
def binning(x, n=10):
    """Discretize numeric data into equal-width bins and summarize each bin.

    Args:
        x: One-dimensional numeric data (e.g. a pandas Series). It is passed
            to ``pd.cut`` unchanged.
        n: Forwarded to ``pd.cut`` as ``bins``; an integer requests that many
            equal-width bins. Defaults to 10.

    Returns:
        A DataFrame with one row per bin, sorted by ``min`` and re-indexed
        from 0, with the columns:

        * ``min``   - smallest value observed in the bin
        * ``max``   - largest value observed in the bin
        * ``total`` - number of values in the bin
        * ``proba`` - ``total / len(x)``

        Bins that received no values are kept: their ``min``/``max`` are NaN,
        ``total`` and ``proba`` are 0, and they sort to the end. NaN entries
        of ``x`` belong to no bin but still count towards ``len(x)``, so
        ``proba`` may sum to less than 1.
    """
    frame = pd.DataFrame({'x': x, 'bucket': pd.cut(x, n)})
    # observed=False is spelled out so empty bins are always reported,
    # independent of the pandas version's default for categorical groupers.
    per_bin = frame.groupby('bucket', observed=False)['x']
    stats = per_bin.agg(['min', 'max', 'count']).rename(
        columns={'count': 'total'}
    )
    stats['proba'] = stats['total'] / len(x)
    # The interval index is dropped; callers address bins by position.
    return stats.sort_values(by='min').reset_index(drop=True)
#replace the values with probability
def replace_proba(value, cut, proba):
    """Return the probability of the first bin whose upper bound covers value.

    Args:
        value: The scalar to look up.
        cut: Upper bounds of the bins, in ascending order (a list, array or
            Series; e.g. the ``max`` column returned by ``binning``).
        proba: Probability of each bin, aligned position-by-position with
            ``cut``.

    Returns:
        The entry of ``proba`` for the first bin where ``value > bound`` is
        False. NaN if ``value`` itself is NaN, so missing values are not
        silently assigned the first bin's probability. ``None`` if ``value``
        is greater than every bound.
    """
    # NaN is the only value that differs from itself; checking it this way
    # needs no extra import.
    if value != value:
        return float('nan')
    # Iterate by position instead of cut[i]/proba[i], which is label-based
    # on a Series and fails when the index is not 0..n-1.
    for upper, bin_proba in zip(cut, proba):
        # "not >" (rather than "<=") so that a NaN bound, as produced for an
        # empty bin, also ends the search.
        if not value > upper:
            return bin_proba
    return None
#combine a row's probabilities into one risk score: exp(sqrt(mean(log(p)^2)))
def risk_score(row_var):
    """Combine a sequence of probabilities into a single score.

    The score is ``exp(sqrt(mean(log(p) ** 2)))``, i.e. the exponential of
    the root-mean-square of the log-probabilities. It equals 1 when every
    probability is 1 and grows as probabilities get smaller; a probability
    of 0 yields ``inf``.

    Args:
        row_var: Sized sequence of probabilities (list, array or Series),
            presumably one row of values produced by ``replace_proba``.
            Entries that are None or NaN make the result NaN.

    Returns:
        The score as a numpy float.

    Raises:
        ZeroDivisionError: If ``row_var`` is empty (the mean is undefined).
    """
    probs = np.asarray(row_var, dtype=float)
    if probs.size == 0:
        # Same exception type the plain division by len(row_var) used to raise.
        raise ZeroDivisionError('risk_score() requires at least one probability')
    mean_sq_log = np.mean(np.square(np.log(probs)))
    return np.exp(np.sqrt(mean_sq_log))
使用:
# Example: replace every value of df[col] with the probability of its bin.
# `df` is the DataFrame being processed and `col` the name of a numeric column.
bin_col = binning(df[col])
df[col] = df[col].apply(
    replace_proba, args=(bin_col['max'], bin_col['proba'])
)