本文使用 scikit-learn 库自带的 iris 数据集进行测试。所用模型也是最简单的一种:根据贝叶斯定理 $P(A|B) = P(B|A)\,P(A)/P(B)$,计算每个类别在样本中出现的概率(代码中的 pLabel 变量),
以及每个类别下每个特征取值出现的概率(代码中的 pNum 变量)。
实现比较粗糙:如果某个类别下没有出现过某个特征取值,则取 p = 1/该类别的样本数量 作为其概率。
如果发现有什么错误,麻烦指出,谢谢。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
# -*- coding:utf-8 -*-
"""Minimal naive Bayes classifier over discrete feature values.

Class priors P(label) and per-class conditional probabilities
P(feature value | label) are estimated by counting. A sample is assigned
to the label that maximises P(label) * prod_j P(value_j | label).
Running the file as a script trains and evaluates on the iris data set.
"""
# NOTE: the wildcard import is kept from the original script for backward
# compatibility. It can shadow builtins (e.g. ``sum``), so the code below
# only uses ``np.``-qualified NumPy calls.
from numpy import *  # noqa: F401,F403
import numpy as np

try:
    from sklearn import datasets
except ImportError:  # scikit-learn is only needed for the default iris data
    datasets = None


class NaiveBayesClassifier(object):
    """Naive Bayes classifier that treats every feature value as a category.

    Attributes:
        dataMat: 2-D array of training samples (one row per sample).
        labelMat: 1-D array with the label of every training sample.
        pLabel: ``{label: P(label)}`` class priors.
        pNum: ``{label: {feature_index: {value: P(value | label)}}}``.
        pDefault: ``{label: 1 / class_sample_count}``, the probability used
            for a feature value that never occurred within that class.
    """

    def __init__(self):
        self.dataMat = list()
        self.labelMat = list()
        self.pLabel = {}
        self.pNum = {}
        self.pDefault = {}

    @staticmethod
    def _classPriors(labels):
        """Return ``{label: relative frequency}`` for a sequence of labels."""
        labels = np.asarray(labels)
        total = float(len(labels))
        values, counts = np.unique(labels, return_counts=True)
        return {value: count / total for value, count in zip(values, counts)}

    def loadDataSet(self, data=None, target=None):
        """Store the training data and compute the class priors.

        Args:
            data: optional 2-D array-like of samples. When both ``data`` and
                ``target`` are omitted the scikit-learn iris data set is used.
            target: optional 1-D array-like of labels, one per sample.

        Raises:
            ImportError: the iris data is requested but scikit-learn is
                not installed.
            ValueError: only one of ``data``/``target`` is given, their
                lengths differ, or the data set is empty.
        """
        if data is None and target is None:
            if datasets is None:
                raise ImportError(
                    "scikit-learn is required to load the iris data set")
            iris = datasets.load_iris()
            data, target = iris.data, iris.target
        elif data is None or target is None:
            raise ValueError("data and target must be supplied together")

        data = np.asarray(data)
        target = np.asarray(target)
        if len(data) != len(target):
            raise ValueError("data and target must have the same length")
        if len(target) == 0:
            raise ValueError("the data set is empty")

        self.dataMat = data
        self.labelMat = target
        self.pLabel = self._classPriors(target)

    def seperateByClass(self):
        """Group the training samples by label.

        Returns:
            ``{label: [sample, ...]}`` preserving the original sample order.
        """
        seperated = {}
        for vector, label in zip(self.dataMat, self.labelMat):
            seperated.setdefault(label, []).append(vector)
        return seperated

    def getProbByArray(self, data):
        """Compute the relative frequency of every value in every column.

        Args:
            data: 2-D array-like, one row per sample and one column per
                feature.

        Returns:
            ``{column_index: {value: probability}}``. An empty input yields
            an empty dict.

        Raises:
            ValueError: ``data`` is non-empty but not two-dimensional.
        """
        data = np.asarray(data)
        if data.size == 0:
            return {}
        if data.ndim != 2:
            raise ValueError("data must be a 2-D array")

        sampleNum = float(data.shape[0])
        prob = {}
        for i in range(data.shape[1]):
            values, counts = np.unique(data[:, i], return_counts=True)
            prob[i] = {
                value: count / sampleNum
                for value, count in zip(values, counts)
            }
        # The fallback probability for unseen values is NOT stored under
        # prob[0] any more: doing so overwrote the table of feature 0.
        return prob

    def train(self):
        """Estimate priors and conditional probabilities from the stored data.

        Fills ``pLabel``, ``pNum`` and ``pDefault`` from ``dataMat`` and
        ``labelMat``.
        """
        seperated = self.seperateByClass()
        t_pNum = {}      # per label: per feature: per value probability
        t_pDefault = {}  # per label: probability of a never-seen value
        for label, rows in seperated.items():
            rows = np.array(rows)
            t_pNum[label] = self.getProbByArray(rows)
            t_pDefault[label] = 1 / float(len(rows))
        self.pNum = t_pNum
        self.pDefault = t_pDefault
        self.pLabel = self._classPriors(self.labelMat)

    def classify(self, data):
        """Return the most probable label for one sample.

        Args:
            data: sequence of feature values, indexed like the training
                columns.

        Returns:
            The label with the highest posterior score. Ties go to the
            smallest label.

        Raises:
            ValueError: the classifier has not been trained.
        """
        if not self.pNum:
            raise ValueError("classifier is not trained; call train() first")

        bestLabel = None
        bestScore = -np.inf
        for label in sorted(self.pNum):
            # Work in log space so that many features cannot underflow.
            score = np.log(self.pLabel[label])
            fallback = self.pDefault[label]
            for j, table in self.pNum[label].items():
                score += np.log(table.get(data[j], fallback))
            if score > bestScore:
                bestLabel, bestScore = label, score
        return bestLabel

    def test(self, data=None, target=None):
        """Load data, train, and report the accuracy on the training set.

        Args:
            data: optional samples, forwarded to ``loadDataSet``.
            target: optional labels, forwarded to ``loadDataSet``.

        Returns:
            The fraction of training samples classified correctly (this
            value is also printed).
        """
        self.loadDataSet(data, target)
        self.train()
        pred = [self.classify(d) for d in self.dataMat]
        right = int(np.sum(np.asarray(pred) == self.labelMat))
        accuracy = right / float(len(self.labelMat))
        print(accuracy)
        return accuracy


if __name__ == '__main__':
    NB = NaiveBayesClassifier()
    NB.test()
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/Incy_1218/article/details/52891209