Naive Bayes (NB for short) is one of the most widely used classification algorithms. In security it can be used to detect anomalous operations, detect DGA domains, detect DDoS attacks against Apache, and recognize verification codes based on the MNIST dataset.
Naive Bayes comes in the following variants (a minimal comparison sketch follows the list):
Gaussian Naive Bayes (GaussianNB)
Multinomial Naive Bayes (MultinomialNB)
Bernoulli Naive Bayes (BernoulliNB)
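The sketch below is my own minimal example (not from the original text), assuming a recent scikit-learn; it only shows how the three variants are instantiated and which kind of features each one expects.

# Sketch: the three Naive Bayes variants in scikit-learn
import numpy as np
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

X_counts = np.array([[2, 0, 1], [0, 3, 0], [1, 1, 4]])  # toy count features
X_binary = (X_counts > 0).astype(int)                    # toy binary features
y = [0, 1, 0]

GaussianNB().fit(X_counts, y)     # continuous features (Gaussian likelihood)
MultinomialNB().fit(X_counts, y)  # count features, e.g. word / n-gram counts
BernoulliNB().fit(X_binary, y)    # binary presence/absence features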
1. Hello Naive Bayes (GaussianNB on the iris dataset)
# coding: utf-8
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB

iris = datasets.load_iris()
gnb = GaussianNB()
# train on the iris data and predict on the same data
y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
print("Number of mislabeled points out of total %d points : %d"
      % (iris.data.shape[0], (iris.target != y_pred).sum()))
2. Detecting anomalous operations
The approach:
1. Data collection and cleaning
2. Featurization
3. Model training
4. Validation of the results
Code for detecting anomalous operations:
# -*- coding:utf-8 -*-
import numpy as np
from nltk.probability import FreqDist  # command frequency statistics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

N = 90  # the first N command blocks are used for training, the rest for testing
def load_user_cmd_new(filename):
    """Read the command log and cut it into blocks of 100 commands each."""
    cmd_list = []
    dist = []
    with open(filename) as f:
        i = 0
        x = []
        for line in f:
            line = line.strip('\n')
            x.append(line)
            dist.append(line)
            i += 1
            if i == 100:
                cmd_list.append(x)
                x = []
                i = 0
    fdist = FreqDist(dist).keys()  # the distinct commands seen in the whole log
    return cmd_list, fdist
def load_user_cmd(filename):
    """As above, but also return the 50 most and 50 least frequent commands.
    Note: relies on older NLTK where FreqDist.keys() is sorted by decreasing frequency."""
    cmd_list = []
    dist = []
    with open(filename) as f:
        i = 0
        x = []
        for line in f:
            line = line.strip('\n')
            x.append(line)
            dist.append(line)
            i += 1
            if i == 100:
                cmd_list.append(x)
                x = []
                i = 0
    fdist = FreqDist(dist).keys()
    dist_max = set(fdist[0:50])
    dist_min = set(fdist[-50:])
    return cmd_list, dist_max, dist_min
# Feature extraction
def get_user_cmd_feature(user_cmd_list, dist_max, dist_min):
    """Three features per block of 100 commands:
    f1: number of distinct commands in the block
    f2: overlap of the block's 10 most frequent commands with the global top 50
    f3: overlap of the block's 10 least frequent commands with the global bottom 50"""
    user_cmd_feature = []
    for cmd_block in user_cmd_list:
        f1 = len(set(cmd_block))
        fdist = FreqDist(cmd_block).keys()
        f2 = fdist[0:10]
        f3 = fdist[-10:]
        f2 = len(set(f2) & set(dist_max))
        f3 = len(set(f3) & set(dist_min))
        x = [f1, f2, f3]
        user_cmd_feature.append(x)
    return user_cmd_feature
def get_user_cmd_feature_new(user_cmd_list, dist):
    """Bag-of-commands features: one dimension per distinct command,
    set to 1 if that command appears in the block."""
    user_cmd_feature = []
    for cmd_list in user_cmd_list:
        v = [0] * len(dist)
        for i in range(0, len(dist)):
            if dist[i] in cmd_list:
                v[i] += 1
        user_cmd_feature.append(v)
    return user_cmd_feature
def get_label(filename, index=0):
    x = []
    with open(filename) as f:
        for line in f:
            line = line.strip('\n')
            x.append(int(line.split()[index]))
    return x
if __name__ == '__main__':
    # User3 of the masquerade dataset: 15000 commands cut into 150 blocks of 100
    user_cmd_list, dist = load_user_cmd_new("/home/qin/code/python/web-ml/1book-master/data/MasqueradeDat/User3")
    user_cmd_feature = get_user_cmd_feature_new(user_cmd_list, dist)
    # column 2 of label.txt labels the last 100 blocks of User3; the first 50 blocks are normal
    labels = get_label("/home/qin/code/python/web-ml/1book-master/data/MasqueradeDat/label.txt", 2)
    y = [0] * 50 + labels

    x_train = user_cmd_feature[0:N]
    y_train = y[0:N]
    x_test = user_cmd_feature[N:150]
    y_test = y[N:150]

    # KNN as a baseline
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(x_train, y_train)
    y_predict_knn = neigh.predict(x_test)
    print y_train

    # Gaussian Naive Bayes
    clf = GaussianNB().fit(x_train, y_train)
    y_predict_nb = clf.predict(x_test)

    score = np.mean(y_test == y_predict_knn) * 100
    print "KNN %d " % score
    score = np.mean(y_test == y_predict_nb) * 100
    print "NB %d" % score
Results:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
KNN 83
NB 83
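The hand-rolled get_user_cmd_feature_new above is essentially a bag-of-words over commands: one dimension per distinct command, set when that command occurs in the block. The same featurization can be expressed with CountVectorizer; the sketch below is my own toy illustration (command blocks joined into whitespace-separated strings), not part of the original script:

# Sketch: bag-of-commands with CountVectorizer (binary presence, like v[i] above)
from sklearn.feature_extraction.text import CountVectorizer

cmd_blocks = ["ls cd ls cat vi", "wget chmod ls ps"]  # each block of commands as one string
vectorizer = CountVectorizer(token_pattern=r'\S+', binary=True)
print(vectorizer.fit_transform(cmd_blocks).toarray())
print(vectorizer.vocabulary_)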
3. Detecting WebShells
# -*- coding:utf-8 -*-
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB

def load_file(file_path):
    """Read a PHP file into a single string, stripping newlines."""
    t = ""
    with open(file_path) as f:
        for line in f:
            line = line.strip('\n')
            t += line
    return t

def load_files(path):
    """Collect the contents of all .php files under path."""
    files_list = []
    for r, d, files in os.walk(path):
        for file in files:
            if file.endswith('.php'):
                file_path = path + file
                print "Load %s" % file_path
                t = load_file(file_path)
                files_list.append(t)
    return files_list
if __name__ == '__main__':
    # ngram_range=(2,2): use 2-grams; decode_error="ignore": skip undecodable bytes;
    # token_pattern=r'\b\w+\b': split the text into words
    webshell_bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                                                 token_pattern=r'\b\w+\b', min_df=1)
    webshell_files_list = load_files("/home/qin/code/python/web-ml/1book-master/data/PHP-WEBSHELL/xiaoma/")
    x1 = webshell_bigram_vectorizer.fit_transform(webshell_files_list).toarray()
    y1 = [1] * len(x1)
    # reuse the WebShell vocabulary so both classes share the same feature space
    vocabulary = webshell_bigram_vectorizer.vocabulary_
    wp_bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                                           token_pattern=r'\b\w+\b', min_df=1, vocabulary=vocabulary)
    wp_files_list = load_files("/home/qin/code/python/web-ml/1book-master/data/wordpress/")
    x2 = wp_bigram_vectorizer.fit_transform(wp_files_list).toarray()
    y2 = [0] * len(x2)
    x = np.concatenate((x1, x2))
    y = np.concatenate((y1, y2))
    clf = GaussianNB()
    print cross_validation.cross_val_score(clf, x, y, n_jobs=-1, cv=3)
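Note: the cross_validation module was removed in scikit-learn 0.20. On a newer install the same three-fold cross-validation runs unchanged apart from the import and the last line (a minimal adaptation, everything else as above):

from sklearn.model_selection import cross_val_score
print(cross_val_score(clf, x, y, n_jobs=-1, cv=3))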
4. WebShell detection, variant 2: features based on function calls and string constants
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB

# keep tokens that look like function calls ("name(") or single-quoted string constants ("'word'")
r_token_pattern = r'\b\w+\b\(|\'\w+\''

def load_file(file_path):
    t = ""
    with open(file_path) as f:
        for line in f:
            line = line.strip('\n')
            t += line
    return t

def load_files(path):
    files_list = []
    for r, d, files in os.walk(path):
        for file in files:
            if file.endswith('.php'):
                file_path = path + file
                #print "Load %s" % file_path
                t = load_file(file_path)
                files_list.append(t)
    return files_list
if __name__ == '__main__':
    # 1-grams over the custom token pattern: each feature is one function call or string constant
    webshell_bigram_vectorizer = CountVectorizer(ngram_range=(1, 1), decode_error="ignore",
                                                 token_pattern=r_token_pattern, min_df=1)
    webshell_files_list = load_files("/home/qin/code/python/web-ml/1book-master/data/PHP-WEBSHELL/xiaoma/")
    x1 = webshell_bigram_vectorizer.fit_transform(webshell_files_list).toarray()
    y1 = [1] * len(x1)
    vocabulary = webshell_bigram_vectorizer.vocabulary_
    wp_bigram_vectorizer = CountVectorizer(ngram_range=(1, 1), decode_error="ignore",
                                           token_pattern=r_token_pattern, min_df=1, vocabulary=vocabulary)
    wp_files_list = load_files("/home/qin/code/python/web-ml/1book-master/data/wordpress/")
    x2 = wp_bigram_vectorizer.transform(wp_files_list).toarray()
    y2 = [0] * len(x2)
    x = np.concatenate((x1, x2))
    y = np.concatenate((y1, y2))
    clf = GaussianNB()
    print vocabulary
    print cross_validation.cross_val_score(clf, x, y, n_jobs=-1, cv=3)
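To make r_token_pattern concrete: it keeps only tokens that look like function calls (a word followed by an opening parenthesis) or single-quoted string constants, which is what a typical one-line WebShell is built from. A quick check of what the regex extracts (my own sketch; the sample PHP line is made up):

# Sketch: what r_token_pattern extracts from a typical one-line WebShell
import re

r_token_pattern = r'\b\w+\b\(|\'\w+\''
sample = "<?php @eval($_POST['cmd']); assert($_REQUEST['x']); ?>"
print(re.findall(r_token_pattern, sample))
# prints: ['eval(', "'cmd'", 'assert(', "'x'"]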