朴素贝叶斯分类算法
一.贝叶斯分类的原理
贝叶斯分类器的分类原理是通过某对象的先验概率,利用贝叶斯公式计算出其后验概率,即该对象属于某一类的概率,选择具有最大后验概率的类作为该对象所属的类。也就是说,贝叶斯分类器是最小错误率意义上的优化。
贝叶斯分类器是用于分类的贝叶斯网络。该网络中应包含类结点C ,其中C 的取值来自于类集合( c1 , c2 , ... , cm) ,还包含一组结点X = ( X1 , X2 , ... , Xn) ,表示用于分类的特征。对于贝叶斯网络分类器,若某一待分类的样本D ,其分类特征值为x = ( x1 , x2 , ... , x n) ,则样本D 属于类别ci 的概率P( C = ci | X1 = x1 , X2 = x 2 , ... , Xn = x n) ,( i = 1 ,2 , ... , m) 应满足下式:
P( C = ci | X = x) = Max{ P( C = c1 | X = x) , P( C = c2 | X = x ) , ... , P( C = cm | X = x ) }
贝叶斯公式:
P( C = ci | X = x) = P( X = x | C = ci) * P( C = ci) / P( X = x) 其中,P( C = ci) 可由领域专家的经验得到, 而P( X = x | C = ci) 和P( X = x) 的计算则较困难。
二.贝叶斯伪代码
整个算法可以分为两个部分,“建立模型”与“进行预测”,其建立模型的伪代码如下:
numAttrValues 等简单的数据从本地数据结构中直接读取 构建几个关键的计数表
for(为每一个实例) {
for( 每个属性 ){
为 numClassAndAttr 中当前类,当前属性,当前取值的单元加 1 为 attFrequencies 中当前取值单元加 1
}
}
预测的伪代码如下:
for(每一个类别){
for(对每个属性 xj){
for(对每个属性 xi){
if(F(xi)小于 m){
出现的次数没有超过阀值,不予考虑
}else {
查 numClassAndAttr 计算 F(y, xi, xj)
}
}
查 numClassAndAttr 计算 F(y, xi)
}
计算公式(7)中的评价函数,记录下这个类别的评价函数值
}
得到了各个类别上的条件概率,再带入贝叶斯公式算出最后的概率
三、算法实现代码
(1)建立模型
void GetModel::TrainModel()
{
int SplitVal;
Cache();//为模型分配内存
for (int i=0;iMaxAttNo;i++)
{
if (!gi->MaxAttValNo[i])
SplitPoint[i]=gi->SplitContinuousAtt(i);
}
for (int i=0;iMaxItemNo;i++)
{
for (int j=0;jMaxAttNo;j++)
{
if (gi->MaxAttValNo[j])
PostFreq[gi->Item[i][gi->MaxAttNo+1].DiscrValue][j][gi->Item[i][j].DiscrValue]++;
else
{
if (gi->Item[i][j].continuousVal
SplitVal=1;
else
SplitVal=2;
PostFreq[gi->Item[i][gi->MaxAttNo+1].DiscrValue][j][SplitVal]++;
}
}
}
}
void GetModel::Cache()
{
PostFreq=(float ***) calloc(gi->MaxClassNo+1,sizeof (float **));
for (int i=0;iMaxClassNo;i++)
{
PostFreq[i]=(float **) calloc(gi->MaxAttNo+1,sizeof (float *)); for (int j=0;jMaxAttNo;j++)
{
if (gi->MaxAttValNo[j])
PostFreq[i][j]=(float
*)calloc(gi->MaxAttValNo[j]+1,sizeof (float ));
else
PostFreq[i][j]=(float *)calloc(3,sizeof (float ));
}
}
SplitPoint=(float *)calloc(gi->MaxAttNo+1,sizeof (float ));
}
(2)预测
PredictClass::PredictClass(GetInfo *i,GetModel *m,string
Attname):MaxClassNo(-1),MaxDiscrValNo(2),MaxAttNo (-1)
{
gi=i;
gm=m;
filename=name;
this ->Inc=1024;
AttFileName=Attname;
}
void PredictClass::Predict()
{
float *prob;
int BestI=-1;
float MaxProb=0;
int j;
GetName();
GetData();
prob=(float *)calloc(MaxClassNo+1,sizeof (float ));
for (int i=0;i
prob[i]=1.0;
for (int i=0;i
{ name,string
for (j=0;j
{
for (int k=0;k
{
if (MaxAttValNo[k])
prob[j]*=(gm->PostFreq[j][k][Item[i][k].DiscrValue]/gi->ClassFreq[j]); else
{
if (Item[i][k].continuousValSplitPoint[k])
prob[j]*=(gm->PostFreq[j][k][1]/gi->ClassFreq[j]); else
prob[j]*=(gm->PostFreq[j][k][2]/gi->ClassFreq[j]); }
}
prob[j]*=(gi->ClassFreq[j]/(gi->MaxItemNo+1));
if (prob[j]>MaxProb)
{
MaxProb=prob[j];
BestI=j;
}
}
cout
Item[i][MaxAttNo+1].DiscrValue=BestI;
/*重置*/
{
BestI=-1;
MaxProb=0;
for (int i=0;i
prob[i]=1.0;
}
}
show();
}
void PredictClass::show()
{
int i,j;
for ( i=0;i
{
for ( j=0;j
{
if (MaxAttValNo[j])
coutelse
cout
}
cout
}
}
void PredictClass::GetData()
{
FILE *Nf;
char Fn[50];
filename=filename+".data" ;
filename.copy(Fn,filename.length());
Fn[filename.length()]=NULL;
if ( ! ( Nf = fopen(Fn, "r" ) ) )
Error(0, Fn, "" );
int ItemNo=0;
int ItemSpace=0;
do
{
MaxItemNo = ItemNo;
/* Make sure there is room for another item */
if ( ItemNo >= ItemSpace )
{
if ( ItemSpace )
{
ItemSpace += Inc;
Item = (Description
ItemSpace*sizeof (Description));
}
else
{
Item =
*)malloc((ItemSpace=Inc)*sizeof (Description));
}
}
Item[ItemNo] = GetDescription(Nf);
}while ( Item[ItemNo] != 0 && ++ItemNo);
fclose(Nf);
MaxItemNo=ItemNo - 1;
}
void PredictClass::Error(int n, string s1, string s2)
/* ----- */
{
cout
switch (n)
{
case 0: cout
break ;
case 1: cout
break ;
case 2: cout
case 3:cout
break ;
case 4:cout
break ;
case 5:cout
}
cout
exit(1);
}
Description PredictClass::GetDescription(FILE *Df)
{
int AttNo;/* attribute number, 0..MaxAttNo */
char name[50], *endname;
int Dv;
float Cv;
Description Dvec;
if ( ReadName(Df, name) )
{
Dvec = (Description)calloc(MaxAttNo+2, sizeof (AttValue));//因为aMaxAttNo 是从a 开始计数的在加上类别属性,所以+2
for (AttNo=0;AttNo
{
if ( MaxAttValNo[AttNo])
{
/* Discrete value */
Dv = Which(name, AttValName[AttNo], 1, MaxAttValNo[AttNo]); if ( ! Dv )
{
Error(4, AttName[AttNo], name);
}
Dvec[AttNo].DiscrValue=Dv;
}
else
{
/* Continuous value */
Cv = (float )strtod(name, &endname);
if ( endname == name || *endname != '\0' )
Error(4, AttName[AttNo], name);
Dvec[AttNo].continuousVal=Cv;
}
ReadName(Df,name);
}
return Dvec;
}
else
{
return 0;
}
}
void PredictClass::GetName()
{
FILE *Nf;
char Buffer[1000];
char Fn[100];
int AttCeiling=100;
int ClassCeiling=100;
int ValCeiling;
AttFileName.copy(Fn,AttFileName.length());
Fn[AttFileName.length()]=NULL;
strcat_s(Fn,".names" );
if ( ! ( Nf = fopen(Fn, "r" ) ) )
Error(0, Fn, "" );
ClassName = (string *) calloc(ClassCeiling, sizeof (string));
do
{
ReadName(Nf, Buffer);
if ( ++MaxClassNo >= ClassCeiling)
{
ClassCeiling += 100;
ClassName = (string *) realloc(ClassName, ClassCeiling*sizeof (string));
}
ClassName[MaxClassNo]=string(Buffer);
}
while ( Delimiter == ',' );
/* Get attribute and attribute value names from names file */
AttName = (string *) calloc(AttCeiling, sizeof (string));
MaxAttValNo = (short *) calloc(AttCeiling, sizeof (short ));
AttValName = (string **) calloc(AttCeiling, sizeof (string *));
//SpecialStatus = (char *) malloc(AttCeiling);
while ( ReadName(Nf, Buffer) )
{
if ( Delimiter != ':' )
Error(1, Buffer, "" );
if ( ++MaxAttNo >= AttCeiling )//扩大空间
{
AttCeiling += 100;
AttName = (string *) realloc(AttName, AttCeiling*sizeof (string)); MaxAttValNo = (short *) realloc(MaxAttValNo, AttCeiling*sizeof (short ));
AttValName = (string **) realloc(AttValName, AttCeiling*sizeof (string *));
//SpecialStatus = (char *) realloc(SpecialStatus, AttCeiling); }
AttName[MaxAttNo] = string(Buffer);
//SpecialStatus[MaxAttNo] = 0;
MaxAttValNo[MaxAttNo] = 0;
ValCeiling = 100;
AttValName[MaxAttNo] = (string *) calloc(ValCeiling, sizeof (string)); do
{
if ( ! ( ReadName(Nf, Buffer) ) )
Error(2, AttName[MaxAttNo], "" );
if ( ++MaxAttValNo[MaxAttNo] >= ValCeiling )
{
ValCeiling += 100;
AttValName[MaxAttNo] =(string *) realloc(AttValName[MaxAttNo], ValCeiling*sizeof (string));
}
AttValName[MaxAttNo][MaxAttValNo[MaxAttNo]] = string(Buffer); }while ( Delimiter == ',' );
if ( MaxAttValNo[MaxAttNo] == 1 )
{
/* Check for special treatment */
if (!strcmp(Buffer, "continuous" ) )
{
//MaxContAttNo++;
}
else
{
/* Cannot have only one discrete value for an attribute */ Error(3, AttName[MaxAttNo], "" );
}
MaxAttValNo[MaxAttNo] = 0;
}
else if ( MaxAttValNo[MaxAttNo] > MaxDiscrValNo )
MaxDiscrValNo = MaxAttValNo[MaxAttNo];
}
fclose(Nf);
}
bool PredictClass::ReadName(FILE *f,char *Buffer)
{
register char *Sp=Buffer;
register int c;
while ( ( c = getc(f) ) == '|' || space(c) )
{
if ( c == '|' )
SkipComment;
}
if ( c == EOF )
{
Delimiter = EOF;
return false ;
}
//读数据
while ( c != ':' && c != ',' && c != '\n' && c != '|' && c != EOF ) {
if ( c == '.' )
{
if ( ( c = getc(f) ) == '|' || space(c) ) //遇到‘。’后面如果是回车换行注释了,就退出
break ;
*Sp++ = '.' ;
}
if ( c == '\\' )//如果是'\\'就跳过
{
c = getc(f);
}
*Sp++ = c;
if ( c == ' ' )//跳过空格
{
while ( ( c = getc(f) ) == ' ' );
}
else
{
c = getc(f);
}
}
if ( c == '|' )
SkipComment;
Delimiter = c;
/* Strip trailing spaces */
while ( space(*(Sp-1)) ) Sp--;
*Sp++ = '\0';
return true ;
}
int PredictClass::Which(string Val,string List[],short First,short Last) /* ----- */
{
short n=First;
while ( n
n++;
return ( n
}
测试结果
其中原始数据在buycom.data, 测试数据在buytest.data, 测试数据类别集合为: youth medium yes fair。
最大概率为:P(X|buys_computer=yes)P(buys_computer=yes)=0.0282187,对于元组X ,朴素贝叶斯分类器预测元组X 的类别为buys_computer=yes.所以在分类的元组为youth medium yes fair时,预测结果为yes 。当要再次测试数据时,则需要修改buytest.data 中的测试数据。
朴素贝叶斯分类算法
一.贝叶斯分类的原理
贝叶斯分类器的分类原理是通过某对象的先验概率,利用贝叶斯公式计算出其后验概率,即该对象属于某一类的概率,选择具有最大后验概率的类作为该对象所属的类。也就是说,贝叶斯分类器是最小错误率意义上的优化。
贝叶斯分类器是用于分类的贝叶斯网络。该网络中应包含类结点C ,其中C 的取值来自于类集合( c1 , c2 , ... , cm) ,还包含一组结点X = ( X1 , X2 , ... , Xn) ,表示用于分类的特征。对于贝叶斯网络分类器,若某一待分类的样本D ,其分类特征值为x = ( x1 , x2 , ... , x n) ,则样本D 属于类别ci 的概率P( C = ci | X1 = x1 , X2 = x 2 , ... , Xn = x n) ,( i = 1 ,2 , ... , m) 应满足下式:
P( C = ci | X = x) = Max{ P( C = c1 | X = x) , P( C = c2 | X = x ) , ... , P( C = cm | X = x ) }
贝叶斯公式:
P( C = ci | X = x) = P( X = x | C = ci) * P( C = ci) / P( X = x) 其中,P( C = ci) 可由领域专家的经验得到, 而P( X = x | C = ci) 和P( X = x) 的计算则较困难。
二.贝叶斯伪代码
整个算法可以分为两个部分,“建立模型”与“进行预测”,其建立模型的伪代码如下:
numAttrValues 等简单的数据从本地数据结构中直接读取 构建几个关键的计数表
for(为每一个实例) {
for( 每个属性 ){
为 numClassAndAttr 中当前类,当前属性,当前取值的单元加 1 为 attFrequencies 中当前取值单元加 1
}
}
预测的伪代码如下:
for(每一个类别){
for(对每个属性 xj){
for(对每个属性 xi){
if(F(xi)小于 m){
出现的次数没有超过阀值,不予考虑
}else {
查 numClassAndAttr 计算 F(y, xi, xj)
}
}
查 numClassAndAttr 计算 F(y, xi)
}
计算公式(7)中的评价函数,记录下这个类别的评价函数值
}
得到了各个类别上的条件概率,再带入贝叶斯公式算出最后的概率
三、算法实现代码
(1)建立模型
void GetModel::TrainModel()
{
int SplitVal;
Cache();//为模型分配内存
for (int i=0;iMaxAttNo;i++)
{
if (!gi->MaxAttValNo[i])
SplitPoint[i]=gi->SplitContinuousAtt(i);
}
for (int i=0;iMaxItemNo;i++)
{
for (int j=0;jMaxAttNo;j++)
{
if (gi->MaxAttValNo[j])
PostFreq[gi->Item[i][gi->MaxAttNo+1].DiscrValue][j][gi->Item[i][j].DiscrValue]++;
else
{
if (gi->Item[i][j].continuousVal
SplitVal=1;
else
SplitVal=2;
PostFreq[gi->Item[i][gi->MaxAttNo+1].DiscrValue][j][SplitVal]++;
}
}
}
}
void GetModel::Cache()
{
PostFreq=(float ***) calloc(gi->MaxClassNo+1,sizeof (float **));
for (int i=0;iMaxClassNo;i++)
{
PostFreq[i]=(float **) calloc(gi->MaxAttNo+1,sizeof (float *)); for (int j=0;jMaxAttNo;j++)
{
if (gi->MaxAttValNo[j])
PostFreq[i][j]=(float
*)calloc(gi->MaxAttValNo[j]+1,sizeof (float ));
else
PostFreq[i][j]=(float *)calloc(3,sizeof (float ));
}
}
SplitPoint=(float *)calloc(gi->MaxAttNo+1,sizeof (float ));
}
(2)预测
PredictClass::PredictClass(GetInfo *i,GetModel *m,string
Attname):MaxClassNo(-1),MaxDiscrValNo(2),MaxAttNo (-1)
{
gi=i;
gm=m;
filename=name;
this ->Inc=1024;
AttFileName=Attname;
}
void PredictClass::Predict()
{
float *prob;
int BestI=-1;
float MaxProb=0;
int j;
GetName();
GetData();
prob=(float *)calloc(MaxClassNo+1,sizeof (float ));
for (int i=0;i
prob[i]=1.0;
for (int i=0;i
{ name,string
for (j=0;j
{
for (int k=0;k
{
if (MaxAttValNo[k])
prob[j]*=(gm->PostFreq[j][k][Item[i][k].DiscrValue]/gi->ClassFreq[j]); else
{
if (Item[i][k].continuousValSplitPoint[k])
prob[j]*=(gm->PostFreq[j][k][1]/gi->ClassFreq[j]); else
prob[j]*=(gm->PostFreq[j][k][2]/gi->ClassFreq[j]); }
}
prob[j]*=(gi->ClassFreq[j]/(gi->MaxItemNo+1));
if (prob[j]>MaxProb)
{
MaxProb=prob[j];
BestI=j;
}
}
cout
Item[i][MaxAttNo+1].DiscrValue=BestI;
/*重置*/
{
BestI=-1;
MaxProb=0;
for (int i=0;i
prob[i]=1.0;
}
}
show();
}
void PredictClass::show()
{
int i,j;
for ( i=0;i
{
for ( j=0;j
{
if (MaxAttValNo[j])
coutelse
cout
}
cout
}
}
void PredictClass::GetData()
{
FILE *Nf;
char Fn[50];
filename=filename+".data" ;
filename.copy(Fn,filename.length());
Fn[filename.length()]=NULL;
if ( ! ( Nf = fopen(Fn, "r" ) ) )
Error(0, Fn, "" );
int ItemNo=0;
int ItemSpace=0;
do
{
MaxItemNo = ItemNo;
/* Make sure there is room for another item */
if ( ItemNo >= ItemSpace )
{
if ( ItemSpace )
{
ItemSpace += Inc;
Item = (Description
ItemSpace*sizeof (Description));
}
else
{
Item =
*)malloc((ItemSpace=Inc)*sizeof (Description));
}
}
Item[ItemNo] = GetDescription(Nf);
}while ( Item[ItemNo] != 0 && ++ItemNo);
fclose(Nf);
MaxItemNo=ItemNo - 1;
}
void PredictClass::Error(int n, string s1, string s2)
/* ----- */
{
cout
switch (n)
{
case 0: cout
break ;
case 1: cout
break ;
case 2: cout
case 3:cout
break ;
case 4:cout
break ;
case 5:cout
}
cout
exit(1);
}
Description PredictClass::GetDescription(FILE *Df)
{
int AttNo;/* attribute number, 0..MaxAttNo */
char name[50], *endname;
int Dv;
float Cv;
Description Dvec;
if ( ReadName(Df, name) )
{
Dvec = (Description)calloc(MaxAttNo+2, sizeof (AttValue));//因为aMaxAttNo 是从a 开始计数的在加上类别属性,所以+2
for (AttNo=0;AttNo
{
if ( MaxAttValNo[AttNo])
{
/* Discrete value */
Dv = Which(name, AttValName[AttNo], 1, MaxAttValNo[AttNo]); if ( ! Dv )
{
Error(4, AttName[AttNo], name);
}
Dvec[AttNo].DiscrValue=Dv;
}
else
{
/* Continuous value */
Cv = (float )strtod(name, &endname);
if ( endname == name || *endname != '\0' )
Error(4, AttName[AttNo], name);
Dvec[AttNo].continuousVal=Cv;
}
ReadName(Df,name);
}
return Dvec;
}
else
{
return 0;
}
}
void PredictClass::GetName()
{
FILE *Nf;
char Buffer[1000];
char Fn[100];
int AttCeiling=100;
int ClassCeiling=100;
int ValCeiling;
AttFileName.copy(Fn,AttFileName.length());
Fn[AttFileName.length()]=NULL;
strcat_s(Fn,".names" );
if ( ! ( Nf = fopen(Fn, "r" ) ) )
Error(0, Fn, "" );
ClassName = (string *) calloc(ClassCeiling, sizeof (string));
do
{
ReadName(Nf, Buffer);
if ( ++MaxClassNo >= ClassCeiling)
{
ClassCeiling += 100;
ClassName = (string *) realloc(ClassName, ClassCeiling*sizeof (string));
}
ClassName[MaxClassNo]=string(Buffer);
}
while ( Delimiter == ',' );
/* Get attribute and attribute value names from names file */
AttName = (string *) calloc(AttCeiling, sizeof (string));
MaxAttValNo = (short *) calloc(AttCeiling, sizeof (short ));
AttValName = (string **) calloc(AttCeiling, sizeof (string *));
//SpecialStatus = (char *) malloc(AttCeiling);
while ( ReadName(Nf, Buffer) )
{
if ( Delimiter != ':' )
Error(1, Buffer, "" );
if ( ++MaxAttNo >= AttCeiling )//扩大空间
{
AttCeiling += 100;
AttName = (string *) realloc(AttName, AttCeiling*sizeof (string)); MaxAttValNo = (short *) realloc(MaxAttValNo, AttCeiling*sizeof (short ));
AttValName = (string **) realloc(AttValName, AttCeiling*sizeof (string *));
//SpecialStatus = (char *) realloc(SpecialStatus, AttCeiling); }
AttName[MaxAttNo] = string(Buffer);
//SpecialStatus[MaxAttNo] = 0;
MaxAttValNo[MaxAttNo] = 0;
ValCeiling = 100;
AttValName[MaxAttNo] = (string *) calloc(ValCeiling, sizeof (string)); do
{
if ( ! ( ReadName(Nf, Buffer) ) )
Error(2, AttName[MaxAttNo], "" );
if ( ++MaxAttValNo[MaxAttNo] >= ValCeiling )
{
ValCeiling += 100;
AttValName[MaxAttNo] =(string *) realloc(AttValName[MaxAttNo], ValCeiling*sizeof (string));
}
AttValName[MaxAttNo][MaxAttValNo[MaxAttNo]] = string(Buffer); }while ( Delimiter == ',' );
if ( MaxAttValNo[MaxAttNo] == 1 )
{
/* Check for special treatment */
if (!strcmp(Buffer, "continuous" ) )
{
//MaxContAttNo++;
}
else
{
/* Cannot have only one discrete value for an attribute */ Error(3, AttName[MaxAttNo], "" );
}
MaxAttValNo[MaxAttNo] = 0;
}
else if ( MaxAttValNo[MaxAttNo] > MaxDiscrValNo )
MaxDiscrValNo = MaxAttValNo[MaxAttNo];
}
fclose(Nf);
}
bool PredictClass::ReadName(FILE *f,char *Buffer)
{
register char *Sp=Buffer;
register int c;
while ( ( c = getc(f) ) == '|' || space(c) )
{
if ( c == '|' )
SkipComment;
}
if ( c == EOF )
{
Delimiter = EOF;
return false ;
}
//读数据
while ( c != ':' && c != ',' && c != '\n' && c != '|' && c != EOF ) {
if ( c == '.' )
{
if ( ( c = getc(f) ) == '|' || space(c) ) //遇到‘。’后面如果是回车换行注释了,就退出
break ;
*Sp++ = '.' ;
}
if ( c == '\\' )//如果是'\\'就跳过
{
c = getc(f);
}
*Sp++ = c;
if ( c == ' ' )//跳过空格
{
while ( ( c = getc(f) ) == ' ' );
}
else
{
c = getc(f);
}
}
if ( c == '|' )
SkipComment;
Delimiter = c;
/* Strip trailing spaces */
while ( space(*(Sp-1)) ) Sp--;
*Sp++ = '\0';
return true ;
}
int PredictClass::Which(string Val,string List[],short First,short Last) /* ----- */
{
short n=First;
while ( n
n++;
return ( n
}
测试结果
其中原始数据在buycom.data, 测试数据在buytest.data, 测试数据类别集合为: youth medium yes fair。
最大概率为:P(X|buys_computer=yes)P(buys_computer=yes)=0.0282187,对于元组X ,朴素贝叶斯分类器预测元组X 的类别为buys_computer=yes.所以在分类的元组为youth medium yes fair时,预测结果为yes 。当要再次测试数据时,则需要修改buytest.data 中的测试数据。