搭建模型
# Build a decision-tree regressor on a hand-picked set of Melbourne housing features.
from sklearn.tree import DecisionTreeRegressor

# 'Lattitude' / 'Longtitude' are the column names as they actually appear
# (misspelled) in the Melbourne dataset.
melbourne_predictors = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_predictors]  # select the wanted feature columns directly
y = melbourne_data.Price                  # prediction target

# Define, then fit — a pandas DataFrame can be passed straight to fit().
melbourne_model = DecisionTreeRegressor()
melbourne_model.fit(X, y)
归一化(根据某个数据归一)
# Normalisation: fit a scaler on one dataset, then apply it to others.
from sklearn import preprocessing

train_data = [[0, 0], [0, 0], [1, 1], [1, 1]]
test_data = [[1, 0], [0, 0], [0, 1], [1, 1]]

# Standardisation based on mean and std (z-score),
# using statistics computed from train_data only.
scaler = preprocessing.StandardScaler().fit(train_data)
print(scaler.transform(train_data))
print(scaler.transform(test_data))

# Scale each feature into a fixed range instead.
# feature_range sets the target range; note it must be a tuple, i.e. (low, high).
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(train_data)
print(scaler.transform(train_data))
print(scaler.transform(test_data))
正则化
# L2 normalisation: rescale each row to unit Euclidean norm.
X = [
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
]
X_normalized = preprocessing.normalize(X, norm='l2')
print(X_normalized)
独热编码
data = [[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]
encoder = preprocessing.OneHotEncoder().fit(data)
print(" x0 x1 y0 y1 y2 z0 z1 z2 z3")
print(encoder.transform(data).toarray())
# Label-encode each categorical column of every dataframe in data_cleaner,
# adding a parallel '<name>_Code' integer column alongside the original.
label = LabelEncoder()
categorical_cols = ('Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin')
for dataset in data_cleaner:
    for col in categorical_cols:
        dataset[col + '_Code'] = label.fit_transform(dataset[col])
数据集划分
# Dataset split: hold out 30% of the samples for evaluation.
from sklearn.model_selection import train_test_split

X = iris.data    # feature matrix
y = iris.target  # sample labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)  # fixed seed → reproducible split
交叉验证
# 3-fold cross-validation scored by accuracy, running on a single worker.
# NOTE(review): cross-validation is normally performed on the *training* data;
# running it on the held-out test set defeats the purpose of the hold-out —
# confirm whether X_train/y_train was intended here.
cross_val_score(model, X_test, y_test, scoring='accuracy', cv=3, n_jobs=1)
使用评价标准
# Evaluate with mean absolute error.
from sklearn.metrics import mean_absolute_error
predicted_home_prices = melbourne_model.predict(X)
# NOTE(review): this measures *in-sample* (training) error — the model was fit
# on this same X/y above — so the score will be optimistically low.
mean_absolute_error(y, predicted_home_prices)
补全数据(处理缺失值)
- 去掉
# Option 1: drop any column that contains missing values.
data_without_missing_values = original_data.dropna(axis=1)

# Drop the *same* columns from train and test so their schemas stay aligned.
cols_with_missing = [col for col in original_data.columns
                     if original_data[col].isnull().any()]
# Fixed typo: was 'redued_original_data'.
reduced_original_data = original_data.drop(cols_with_missing, axis=1)
reduced_test_data = test_data.drop(cols_with_missing, axis=1)
- 补充
# For numeric columns the imputer fills with the column mean by default.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# sklearn.impute.SimpleImputer is the current equivalent.
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
data_with_imputed_values = my_imputer.fit_transform(original_data)
# Make a copy so imputation does not mutate the original data.
new_data = original_data.copy()

# Add an indicator column for every column with missing values, recording
# which rows are about to be imputed.
# Fixed bug: the condition used new_data[c], but 'c' was never defined —
# the loop variable is 'col'.
cols_with_missing = [col for col in new_data.columns
                     if new_data[col].isnull().any()]
for col in cols_with_missing:
    new_data[col + '_was_missing'] = new_data[col].isnull()

# Imputation (SimpleImputer replaces the Imputer class removed in sklearn 0.22).
# NOTE(review): fit_transform returns a plain ndarray, so column names are lost.
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
new_data = my_imputer.fit_transform(new_data)
pipeline
Our pipeline must start with transformer steps and end with a model.