第二章
1、混淆矩阵
# Confusion-matrix demo: build a confusion matrix from hard-coded true and
# predicted labels, show it as a grayscale image, then print a per-class
# classification report.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Define sample labels
true_labels = [2, 0, 0, 2, 4, 4, 1, 0, 3, 3, 3]
pred_labels = [2, 1, 0, 2, 4, 3, 1, 0, 1, 3, 3]

# Create confusion matrix
confusion_mat = confusion_matrix(true_labels, pred_labels)

# Visualize confusion matrix
plt.imshow(confusion_mat, interpolation='nearest', cmap=plt.cm.gray)
plt.title('Confusion matrix')
plt.colorbar()
# One tick per class; derived from the matrix so it follows the label set
# (5 classes for the sample labels above).
ticks = np.arange(confusion_mat.shape[0])
plt.xticks(ticks, ticks)
plt.yticks(ticks, ticks)
plt.ylabel('True labels')
plt.xlabel('Predicted labels')
plt.show()

# Classification report
targets = ['Class-0', 'Class-1', 'Class-2', 'Class-3', 'Class-4']
print('\n', classification_report(true_labels, pred_labels, target_names=targets))
2、数据预处理
# Data-preprocessing demo: binarization, mean removal, min-max scaling and
# L1/L2 normalization applied to a small hard-coded 4x3 matrix.
import numpy as np
from sklearn import preprocessing

input_data = np.array([
    [5.1, -2.9, 3.3],
    [-1.2, 7.8, -6.1],
    [3.9, 0.4, 2.1],
    [7.3, -9.9, -4.5],
])

# Binarize data using 2.1 as the threshold
data_binarized = preprocessing.Binarizer(threshold=2.1).transform(input_data)
print("\nBinarized data:\n", data_binarized)

# Print per-column mean and standard deviation before scaling
print("\nBEFORE:")
print("Mean =", input_data.mean(axis=0))
print("Std deviation =", input_data.std(axis=0))

# Remove mean (and scale), then print the same statistics again
data_scaled = preprocessing.scale(input_data)
print("\nAFTER:")
print("Mean =", data_scaled.mean(axis=0))
print("Std deviation =", data_scaled.std(axis=0))

# Min max scaling into the range [0, 1]
data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled_minmax = data_scaler_minmax.fit_transform(input_data)
print("\nMin max scaled data:\n", data_scaled_minmax)

# Normalize data with the L1 and L2 norms
data_normalized_l1 = preprocessing.normalize(input_data, norm='l1')
data_normalized_l2 = preprocessing.normalize(input_data, norm='l2')
print("\nL1 normalized data:\n", data_normalized_l1)
print("\nL2 normalized data:\n", data_normalized_l2)
3.房价
# House-price demo: train a linear-kernel Support Vector Regressor on the
# housing dataset, report test-set error metrics, and predict one datapoint.
import numpy as np
from sklearn import datasets
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.utils import shuffle

# Load housing data
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2. This script therefore needs scikit-learn < 1.2, or another source for
# the same 13-feature dataset (test_data below has 13 values).
data = datasets.load_boston()

# Shuffle the data (fixed seed so the split is reproducible)
X, y = shuffle(data.data, data.target, random_state=7)

# Split the data into training (first 80%) and testing (remaining 20%) datasets
num_training = int(0.8 * len(X))
X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]

# Create Support Vector Regression model
sv_regressor = SVR(kernel='linear', C=1.0, epsilon=0.1)

# Train Support Vector Regressor
sv_regressor.fit(X_train, y_train)

# Evaluate performance of Support Vector Regressor
y_test_pred = sv_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
evs = explained_variance_score(y_test, y_test_pred)
print("\n#### Performance ####")
print("Mean squared error =", round(mse, 2))
print("Explained variance score =", round(evs, 2))

# Test the regressor on test datapoint (wrapped in a list: predict expects 2-D input)
test_data = [3.7, 0, 18.4, 1, 0.87, 5.95, 91, 2.5052, 26, 666, 20.2, 351.34, 15.27]
print("\nPredicted price:", sv_regressor.predict([test_data])[0])
4.收入分类
# Income-classification demo: read 'income_data.txt', label-encode the
# non-numeric columns, train a one-vs-one LinearSVC, report a cross-validated
# weighted F1 score, and classify one hand-written datapoint.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn import model_selection

# Input file containing data
input_file = 'income_data.txt'

# Read the data, keeping at most max_datapoints rows per class
X = []
y = []
count_class1 = 0
count_class2 = 0
max_datapoints = 25000

with open(input_file, 'r') as f:
    for line in f:
        if count_class1 >= max_datapoints and count_class2 >= max_datapoints:
            break

        # Skip rows that contain a '?' placeholder
        if '?' in line:
            continue

        # Strip only the newline so a final line without one is not truncated
        data = line.rstrip('\n').split(', ')

        if data[-1] == '<=50K' and count_class1 < max_datapoints:
            X.append(data)
            count_class1 += 1

        if data[-1] == '>50K' and count_class2 < max_datapoints:
            X.append(data)
            count_class2 += 1

# Convert to numpy array
X = np.array(X)

# Convert string data to numerical data. The first row decides which columns
# are numeric; every other column gets its own LabelEncoder, appended in
# column order (the last encoder therefore belongs to the label column).
label_encoder = []
X_encoded = np.empty(X.shape)
for i, item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:, i] = X[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i])

# Last column is the label, the rest are features
X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)

# Hold out 20% of the data and train the SVM classifier on the rest
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=5)
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

# Compute the F1 score of the SVM classifier with 3-fold cross validation
f1 = model_selection.cross_val_score(classifier, X, y, scoring='f1_weighted', cv=3)
print("F1 score: " + str(round(100*f1.mean(), 2)) + "%")

# Predict output for a test datapoint
input_data = ['37', 'Private', '215646', 'HS-grad', '9', 'Never-married', 'Handlers-cleaners', 'Not-in-family', 'White', 'Male', '0', '0', '40', 'United-States']

# Encode test datapoint with the encoders fitted above; `count` walks through
# label_encoder in the same column order used during fitting.
input_data_encoded = [-1] * len(input_data)
count = 0
for i, item in enumerate(input_data):
    if item.isdigit():
        input_data_encoded[i] = int(item)
    else:
        # transform expects an array-like, so wrap the single value in a list
        input_data_encoded[i] = int(label_encoder[count].transform([item])[0])
        count += 1

# predict expects a 2-D array of shape (n_samples, n_features)
input_data_encoded = np.array(input_data_encoded).reshape(1, -1)

# Run classifier on encoded datapoint and print output
predicted_class = classifier.predict(input_data_encoded)
print(label_encoder[-1].inverse_transform(predicted_class)[0])
5.标签编码
# Label-encoding demo: fit a LabelEncoder on colour names, print the mapping,
# then encode and decode a few values.
import numpy as np
from sklearn import preprocessing

# Sample input labels
input_labels = ['red', 'black', 'red', 'green', 'black', 'yellow', 'white']

# Create label encoder and fit the labels
encoder = preprocessing.LabelEncoder()
encoder.fit(input_labels)

# Print the mapping (index in classes_ is the encoded value)
print("\nLabel mapping:")
for i, item in enumerate(encoder.classes_):
    print(item, '-->', i)

# Encode a set of labels using the encoder
test_labels = ['green', 'red', 'black']
encoded_values = encoder.transform(test_labels)
print("\nLabels =", test_labels)
print("Encoded values =", list(encoded_values))

# Decode a set of values using the encoder
encoded_values = [3, 0, 4, 1]
decoded_list = encoder.inverse_transform(encoded_values)
print("\nEncoded values =", encoded_values)
print("Decoded labels =", list(decoded_list))
6.逻辑回归
# Logistic-regression demo: fit a classifier on twelve 2-D points in four
# classes and hand it to visualize_classifier.
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
# visualize_classifier comes from a local `utilities` module that is not part
# of this file; presumably it plots the decision regions — verify against it.
from utilities import visualize_classifier

# Define sample input data
X = np.array([[3.1, 7.2], [4, 6.7], [2.9, 8], [5.1, 4.5], [6, 5], [5.6, 5], [3.3, 0.4], [3.9, 0.9], [2.8, 1], [0.5, 3.4], [1, 4], [0.6, 4.9]])
y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

# Create the logistic regression classifier
classifier = linear_model.LogisticRegression(solver='liblinear', C=1)
# Alternative setting kept for experimentation:
#classifier = linear_model.LogisticRegression(solver='liblinear', C=100)

# Train the classifier
classifier.fit(X, y)

# Visualize the performance of the classifier
visualize_classifier(classifier, X, y)
7.朴素贝叶斯
8.多元回归函数
# Multivariable regression demo: fit a linear regressor on
# 'data_multivar_regr.txt', print test-set metrics, then compare its
# prediction for one datapoint with a degree-10 polynomial regression.
import numpy as np
from sklearn import linear_model
import sklearn.metrics as sm
from sklearn.preprocessing import PolynomialFeatures

# Input file containing data
input_file = 'data_multivar_regr.txt'

# Load the data from the input file; last column is the target
data = np.loadtxt(input_file, delimiter=',')
X, y = data[:, :-1], data[:, -1]

# Split data into training (first 80%) and testing (remaining 20%)
num_training = int(0.8 * len(X))
num_test = len(X) - num_training

# Training data
X_train, y_train = X[:num_training], y[:num_training]

# Test data
X_test, y_test = X[num_training:], y[num_training:]

# Create the linear regressor model
linear_regressor = linear_model.LinearRegression()

# Train the model using the training sets
linear_regressor.fit(X_train, y_train)

# Predict the output
y_test_pred = linear_regressor.predict(X_test)

# Measure performance
print("Linear Regressor performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explained variance score =", round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))

# Polynomial regression
polynomial = PolynomialFeatures(degree=10)
X_train_transformed = polynomial.fit_transform(X_train)
datapoint = [[7.75, 6.35, 5.56]]
# Reuse the transformer fitted on the training data; the query point must be
# transformed, not used to re-fit it.
poly_datapoint = polynomial.transform(datapoint)

poly_linear_model = linear_model.LinearRegression()
poly_linear_model.fit(X_train_transformed, y_train)
print("\nLinear regression:\n", linear_regressor.predict(datapoint))
print("\nPolynomial regression:\n", poly_linear_model.predict(poly_datapoint))
9.单变量线性回归与模型持久化
# Single-variable regression demo: fit a linear regressor on
# 'data_singlevar_regr.txt', plot and score it on the test split, then save
# the model with pickle, reload it and score the reloaded copy.
import pickle

import numpy as np
from sklearn import linear_model
import sklearn.metrics as sm
import matplotlib.pyplot as plt

# Input file containing data
input_file = 'data_singlevar_regr.txt'

# Read data; last column is the target
data = np.loadtxt(input_file, delimiter=',')
X, y = data[:, :-1], data[:, -1]

# Train and test split (first 80% / remaining 20%)
num_training = int(0.8 * len(X))
num_test = len(X) - num_training

# Training data
X_train, y_train = X[:num_training], y[:num_training]

# Test data
X_test, y_test = X[num_training:], y[num_training:]

# Create linear regressor object
regressor = linear_model.LinearRegression()

# Train the model using the training sets
regressor.fit(X_train, y_train)

# Predict the output
y_test_pred = regressor.predict(X_test)

# Plot outputs: test points in green, fitted line in black, no axis ticks
plt.scatter(X_test, y_test, color='green')
plt.plot(X_test, y_test_pred, color='black', linewidth=4)
plt.xticks(())
plt.yticks(())
plt.show()

# Compute performance metrics
print("Linear regressor performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explain variance score =", round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))

# Model persistence
output_model_file = 'model.pkl'

# Save the model
with open(output_model_file, 'wb') as f:
    pickle.dump(regressor, f)

# Load the model
# NOTE: pickle.load executes arbitrary code from the file; this is safe here
# only because the file was just written by this script.
with open(output_model_file, 'rb') as f:
    regressor_model = pickle.load(f)

# Perform prediction on test data with the reloaded model
y_test_pred_new = regressor_model.predict(X_test)
print("\nNew mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred_new), 2))