Chapter 4, "Classifying with Probability Theory: Naive Bayes", works through three examples step by step:
- Simple sentence classification (abusive or not)
- Spam email filtering
- Inferring regional preferences from personal ads
Simple sentence classification
- Obtain a training set and a test set.
- Scan every sample in the training set to build the vocabulary list vocabList, and represent each sample as a feature vector over vocabList: entry j is 1 if the j-th word of vocabList occurs in the sample, and 0 otherwise.
- Use the naive Bayes formula (restated below) to compute each test sample's posterior probability for each class, and predict the class with the larger posterior.
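
For reference, the posterior comes from Bayes' rule; writing $w$ for a document's word vector and $c_i$ for a class,

$$p(c_i \mid w) = \frac{p(w \mid c_i)\,p(c_i)}{p(w)},$$

and the "naive" assumption that words are conditionally independent given the class factors the likelihood into

$$p(w \mid c_i) = \prod_j p(w_j \mid c_i).$$

Since $p(w)$ is the same for every class, comparing posteriors reduces to comparing the numerators $p(w \mid c_i)\,p(c_i)$.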
Two places that need attention and improvement
- Computing the posterior multiplies many probability values together, so if any single one of them is 0 the whole product becomes 0 (and is meaningless). To avoid this, initialize the numerators to 1 and the denominators to 2 (Laplace smoothing; in trainNB0 below, p0Num/p1Num start as vectors of ones and p0Denom/p1Denom start at 2.0).
- Most of these probability values are small, and multiplying many small numbers causes underflow, i.e. the result in Python becomes 0. The fix is to take logarithms, which turns the product of many small probabilities into a sum of their logs; see the short demonstration after this list.
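
A minimal sketch of the underflow problem and the log fix (the 200 probabilities of 0.01 are made-up values for illustration):

```python
import math

probs = [0.01] * 200  # 200 hypothetical small conditional probabilities

product = 1.0
for p in probs:
    product *= p
print(product)   # 0.0 -- the true value 1e-400 underflows a Python float

log_sum = sum(math.log(p) for p in probs)
print(log_sum)   # about -921.0, easily representable
```

Because log is monotonically increasing, comparing sums of logs gives the same classification decision as comparing the original products.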
```python
import numpy as np
import math


def loadDataSet():
    # toy training posts and their labels
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmatian', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 is not
    return postingList, classVec


# create a list of all the unique words in all of our documents
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)


# set-of-words model: each entry records whether a word occurs at all
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print('the word:', word, 'is not in my Vocabulary!')
    return returnVec


# bag-of-words variant: counts occurrences instead of mere presence
# (kept for reference; testingNB below uses the set-of-words model)
def bagOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: initialize the numerators to 1 and the
    # denominators to 2 so no conditional probability is exactly 0
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        # vector addition
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # take logs to avoid underflow, based on log(a * b) = log(a) + log(b)
    p0Vect = np.log(p0Num / p0Denom)
    p1Vect = np.log(p1Num / p1Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # vec2Classify is the word vector of the document to classify
    p1 = sum(vec2Classify * p1Vec) + math.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + math.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
```
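
In classifyNB, the dot product of the input word vector with p1Vec (or p0Vec) is exactly $\sum_j w_j \log p(w_j \mid c_i)$, so p1 and p0 are the log-numerators $\log p(w \mid c_i) + \log p(c_i)$ from Bayes' rule above; $p(w)$ can be ignored because it is identical for both classes. The test driver below wires everything together: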
```python
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postInDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postInDoc))
    p0V, p1V, pAb = trainNB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmatian']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
```
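
On this toy data the driver should label the first entry 0 (not abusive) and the second 1 (abusive), since 'stupid' and 'garbage' appear only in the abusive training posts:

```python
if __name__ == '__main__':
    testingNB()
    # expected output:
    # ['love', 'my', 'dalmatian'] classified as: 0
    # ['stupid', 'garbage'] classified as: 1
```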
Inferring regional preferences from personal ads
To be updated.