代码
import pandas as pd
import xgboost as xgb
import operator
def get_data(path="first_result2.csv", n_features=11):
    """Load the training table and return ``(x_train, y_train)``.

    Every text column (object or string dtype) is target-encoded: each
    category value is replaced by the mean of ``target`` over the rows
    that carry that value. Values that are missing in a text column stay
    missing (NaN) after encoding.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or file-like
            object). Defaults to the file name the script was written for.
        n_features: How many leading columns of the table are used as
            features. Defaults to 11, the value the script was written for.

    Returns:
        A tuple ``(x_train, y_train)`` where ``x_train`` is a DataFrame
        holding the first ``n_features`` columns (after encoding) and
        ``y_train`` is the ``target`` column.
    """
    train = pd.read_csv(path)
    # The original note mentions 12 features, but only the first 11 columns
    # are taken by default -- presumably the remaining column is the target;
    # TODO confirm against the actual CSV layout.
    features = list(train.columns[:n_features])
    y_train = train['target']

    # Collect the text columns up front, because the loop below replaces
    # them with numeric columns.
    text_columns = [
        col for col in train.columns
        if pd.api.types.is_object_dtype(train[col])
        or pd.api.types.is_string_dtype(train[col])
    ]

    # Target (mean) encoding. Assign the mapped column back explicitly:
    # calling an in-place method on ``train[feat]`` operates on a temporary
    # selection and is not guaranteed to modify ``train`` itself.
    for feat in text_columns:
        category_means = train.groupby(feat)['target'].mean()
        train[feat] = train[feat].map(category_means)

    x_train = train[features]
    return x_train, y_train
# Load the encoded feature matrix and the label column.
x_train, y_train = get_data()
# Booster hyper-parameters; tune these for your own data.
# NOTE(review): "silent" appears to be superseded by "verbosity" in newer
# xgboost releases -- confirm against the installed version before changing.
xgb_params = {'booster':'gbtree','objective': 'binary:logistic', "eta": 0.01, "max_depth": 5, "silent": 0,"colsample_bytree":0.7}
num_rounds = 1000
dtrain = xgb.DMatrix(x_train, label=y_train)
gbdt = xgb.train(xgb_params, dtrain, num_rounds)
# Per-feature scores from the trained booster, sorted ascending by score so
# the highest-scoring feature is printed last.
importance = gbdt.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))
# Function-call form: valid on Python 3 and prints the same list on Python 2.
print(importance)
结果
[('gender', 578), ('is_sys', 1202), ('is_font_cem', 1448), ('is_sup_cem', 1507), ('ite_phone_num', 1669), ('is_dou_kard', 1729), ('is_auto', 1796), ('age', 2235), ('now_month', 2596), (' avg_flow', 2914), ('avr_cost', 4149)]