朴素贝叶斯分类算法代码实现

朴素贝叶斯分类算法

一．贝叶斯分类的原理

贝叶斯分类器的分类原理是通过某对象的先验概率，利用贝叶斯公式计算出其后验概率，即该对象属于某一类的概率，选择具有最大后验概率的类作为该对象所属的类。也就是说，贝叶斯分类器是最小错误率意义上的优化。

贝叶斯分类器是用于分类的贝叶斯网络。该网络中应包含类结点C ，其中C 的取值来自于类集合( c1 , c2 , ... , cm) ，还包含一组结点X = ( X1 , X2 , ... , Xn) ，表示用于分类的特征。对于贝叶斯网络分类器，若某一待分类的样本D ，其分类特征值为x = ( x1 , x2 , ... , x n) ，则样本D 属于类别ci 的概率P( C = ci | X1 = x1 , X2 = x 2 , ... , Xn = x n) ，( i = 1 ,2 , ... , m) 应满足下式：

P( C = ci | X = x) = Max{ P( C = c1 | X = x) , P( C = c2 | X = x ) , ... , P( C = cm | X = x ) }

贝叶斯公式：

P( C = ci | X = x) = P( X = x | C = ci) * P( C = ci) / P( X = x) 其中，P( C = ci) 可由领域专家的经验得到, 而P( X = x | C = ci) 和P( X = x) 的计算则较困难。

二．贝叶斯伪代码

整个算法可以分为两个部分，“建立模型”与“进行预测”，其建立模型的伪代码如下：

numAttrValues 等简单的数据从本地数据结构中直接读取构建几个关键的计数表

for(为每一个实例) {

for( 每个属性 ){

为 numClassAndAttr 中当前类，当前属性，当前取值的单元加 1 为 attFrequencies 中当前取值单元加 1

}

预测的伪代码如下：

for(每一个类别){

for(对每个属性 xj){

for(对每个属性 xi){

if(F(xi)小于 m){

出现的次数没有超过阀值，不予考虑

}else {

查 numClassAndAttr 计算 F(y, xi, xj)

}

查 numClassAndAttr 计算 F(y, xi)

}

计算公式（7）中的评价函数，记录下这个类别的评价函数值

}

得到了各个类别上的条件概率，再带入贝叶斯公式算出最后的概率

三、算法实现代码

(1)建立模型

void GetModel::TrainModel()

{

int SplitVal;

Cache();//为模型分配内存

for (int i=0;iMaxAttNo;i++)

{

if (!gi->MaxAttValNo[i])

SplitPoint[i]=gi->SplitContinuousAtt(i);

}

for (int i=0;iMaxItemNo;i++)

{

for (int j=0;jMaxAttNo;j++)

{

if (gi->MaxAttValNo[j])

PostFreq[gi->Item[i][gi->MaxAttNo+1].DiscrValue][j][gi->Item[i][j].DiscrValue]++;

else

{

if (gi->Item[i][j].continuousVal

SplitVal=1;

else

SplitVal=2;

PostFreq[gi->Item[i][gi->MaxAttNo+1].DiscrValue][j][SplitVal]++;

}

void GetModel::Cache()

{

PostFreq=(float ***) calloc(gi->MaxClassNo+1,sizeof (float **));

for (int i=0;iMaxClassNo;i++)

{

PostFreq[i]=(float **) calloc(gi->MaxAttNo+1,sizeof (float *)); for (int j=0;jMaxAttNo;j++)

{

if (gi->MaxAttValNo[j])

PostFreq[i][j]=(float

*)calloc(gi->MaxAttValNo[j]+1,sizeof (float ));

else

PostFreq[i][j]=(float *)calloc(3,sizeof (float ));

}

SplitPoint=(float *)calloc(gi->MaxAttNo+1,sizeof (float ));

}

(2)预测

PredictClass::PredictClass(GetInfo *i,GetModel *m,string

Attname):MaxClassNo(-1),MaxDiscrValNo(2),MaxAttNo (-1)

{

gi=i;

gm=m;

filename=name;

this ->Inc=1024;

AttFileName=Attname;

}

void PredictClass::Predict()

{

float *prob;

int BestI=-1;

float MaxProb=0;

int j;

GetName();

GetData();

prob=(float *)calloc(MaxClassNo+1,sizeof (float ));

for (int i=0;i

prob[i]=1.0;

for (int i=0;i

{ name,string

for (j=0;j

{

for (int k=0;k

{

if (MaxAttValNo[k])

prob[j]*=(gm->PostFreq[j][k][Item[i][k].DiscrValue]/gi->ClassFreq[j]); else

{

if (Item[i][k].continuousValSplitPoint[k])

prob[j]*=(gm->PostFreq[j][k][1]/gi->ClassFreq[j]); else

prob[j]*=(gm->PostFreq[j][k][2]/gi->ClassFreq[j]); }

}

prob[j]*=(gi->ClassFreq[j]/(gi->MaxItemNo+1));

if (prob[j]>MaxProb)

{

MaxProb=prob[j];

BestI=j;

}

cout

Item[i][MaxAttNo+1].DiscrValue=BestI;

/*重置*/

{

BestI=-1;

MaxProb=0;

for (int i=0;i

prob[i]=1.0;

}

show();

}

void PredictClass::show()

{

int i,j;

for ( i=0;i

{

for ( j=0;j

{

if (MaxAttValNo[j])

coutelse

cout

}

cout

}

void PredictClass::GetData()

{

FILE *Nf;

char Fn[50];

filename=filename+".data" ;

filename.copy(Fn,filename.length());

Fn[filename.length()]=NULL;

if ( ! ( Nf = fopen(Fn, "r" ) ) )

Error(0, Fn, "" );

int ItemNo=0;

int ItemSpace=0;

{

MaxItemNo = ItemNo;

/* Make sure there is room for another item */

if ( ItemNo >= ItemSpace )

{

if ( ItemSpace )

{

ItemSpace += Inc;

Item = (Description

ItemSpace*sizeof (Description));

}

else

{

Item =

*)malloc((ItemSpace=Inc)*sizeof (Description));

}

Item[ItemNo] = GetDescription(Nf);

}while ( Item[ItemNo] != 0 && ++ItemNo);

fclose(Nf);

MaxItemNo=ItemNo - 1;

}

void PredictClass::Error(int n, string s1, string s2)

/* ----- */

{

cout

switch (n)

{

case 0: cout

break ;

case 1: cout

break ;

case 2: cout

case 3:cout

break ;

case 4:cout

break ;

case 5:cout

}

cout

exit(1);

}

Description PredictClass::GetDescription(FILE *Df)

{

int AttNo;/* attribute number, 0..MaxAttNo */

char name[50], *endname;

int Dv;

float Cv;

Description Dvec;

if ( ReadName(Df, name) )

{

Dvec = (Description)calloc(MaxAttNo+2, sizeof (AttValue));//因为aMaxAttNo 是从a 开始计数的在加上类别属性，所以+2

for (AttNo=0;AttNo

{

if ( MaxAttValNo[AttNo])

{

/* Discrete value */

Dv = Which(name, AttValName[AttNo], 1, MaxAttValNo[AttNo]); if ( ! Dv )

{

Error(4, AttName[AttNo], name);

}

Dvec[AttNo].DiscrValue=Dv;

}

else

{

/* Continuous value */

Cv = (float )strtod(name, &endname);

if ( endname == name || *endname != '\0' )

Error(4, AttName[AttNo], name);

Dvec[AttNo].continuousVal=Cv;

}

ReadName(Df,name);

}

return Dvec;

}

else

{

return 0;

}

void PredictClass::GetName()

{

FILE *Nf;

char Buffer[1000];

char Fn[100];

int AttCeiling=100;

int ClassCeiling=100;

int ValCeiling;

AttFileName.copy(Fn,AttFileName.length());

Fn[AttFileName.length()]=NULL;

strcat_s(Fn,".names" );

if ( ! ( Nf = fopen(Fn, "r" ) ) )

Error(0, Fn, "" );

ClassName = (string *) calloc(ClassCeiling, sizeof (string));

{

ReadName(Nf, Buffer);

if ( ++MaxClassNo >= ClassCeiling)

{

ClassCeiling += 100;

ClassName = (string *) realloc(ClassName, ClassCeiling*sizeof (string));

}

ClassName[MaxClassNo]=string(Buffer);

}

while ( Delimiter == ',' );

/* Get attribute and attribute value names from names file */

AttName = (string *) calloc(AttCeiling, sizeof (string));

MaxAttValNo = (short *) calloc(AttCeiling, sizeof (short ));

AttValName = (string **) calloc(AttCeiling, sizeof (string *));

//SpecialStatus = (char *) malloc(AttCeiling);

while ( ReadName(Nf, Buffer) )

{

if ( Delimiter != ':' )

Error(1, Buffer, "" );

if ( ++MaxAttNo >= AttCeiling )//扩大空间

{

AttCeiling += 100;

AttName = (string *) realloc(AttName, AttCeiling*sizeof (string)); MaxAttValNo = (short *) realloc(MaxAttValNo, AttCeiling*sizeof (short ));

AttValName = (string **) realloc(AttValName, AttCeiling*sizeof (string *));

//SpecialStatus = (char *) realloc(SpecialStatus, AttCeiling); }

AttName[MaxAttNo] = string(Buffer);

//SpecialStatus[MaxAttNo] = 0;

MaxAttValNo[MaxAttNo] = 0;

ValCeiling = 100;

AttValName[MaxAttNo] = (string *) calloc(ValCeiling, sizeof (string)); do

{

if ( ! ( ReadName(Nf, Buffer) ) )

Error(2, AttName[MaxAttNo], "" );

if ( ++MaxAttValNo[MaxAttNo] >= ValCeiling )

{

ValCeiling += 100;

AttValName[MaxAttNo] =(string *) realloc(AttValName[MaxAttNo], ValCeiling*sizeof (string));

}

AttValName[MaxAttNo][MaxAttValNo[MaxAttNo]] = string(Buffer); }while ( Delimiter == ',' );

if ( MaxAttValNo[MaxAttNo] == 1 )

{

/* Check for special treatment */

if (!strcmp(Buffer, "continuous" ) )

{

//MaxContAttNo++;

}

else

{

/* Cannot have only one discrete value for an attribute */ Error(3, AttName[MaxAttNo], "" );

}

MaxAttValNo[MaxAttNo] = 0;

}

else if ( MaxAttValNo[MaxAttNo] > MaxDiscrValNo )

MaxDiscrValNo = MaxAttValNo[MaxAttNo];

}

fclose(Nf);

}

bool PredictClass::ReadName(FILE *f,char *Buffer)

{

while ( ( c = getc(f) ) == '|' || space(c) )

{

if ( c == '|' )

SkipComment;

}

if ( c == EOF )

{

Delimiter = EOF;

return false ;

}

//读数据

while ( c != ':' && c != ',' && c != '\n' && c != '|' && c != EOF ) {

if ( c == '.' )

{

if ( ( c = getc(f) ) == '|' || space(c) ) //遇到‘。’后面如果是回车换行注释了，就退出

break ;

*Sp++ = '.' ;

}

if ( c == '\\' )//如果是'\\'就跳过

{

c = getc(f);

}

*Sp++ = c;

if ( c == ' ' )//跳过空格

{

while ( ( c = getc(f) ) == ' ' );

}

else

{

c = getc(f);

}

if ( c == '|' )

SkipComment;

Delimiter = c;

/* Strip trailing spaces */

while ( space(*(Sp-1)) ) Sp--;

*Sp++ = '\0';

return true ;

}

int PredictClass::Which(string Val,string List[],short First,short Last) /* ----- */

{

short n=First;

while ( n

n++;

return ( n

}

测试结果

其中原始数据在buycom.data, 测试数据在buytest.data, 测试数据类别集合为： youth medium yes fair。

最大概率为：P(X|buys_computer=yes)P(buys_computer=yes)=0.0282187,对于元组X ，朴素贝叶斯分类器预测元组X 的类别为buys_computer=yes.所以在分类的元组为youth medium yes fair时，预测结果为yes 。当要再次测试数据时，则需要修改buytest.data 中的测试数据。

朴素贝叶斯分类算法

一．贝叶斯分类的原理

P( C = ci | X = x) = Max{ P( C = c1 | X = x) , P( C = c2 | X = x ) , ... , P( C = cm | X = x ) }

贝叶斯公式：

P( C = ci | X = x) = P( X = x | C = ci) * P( C = ci) / P( X = x) 其中，P( C = ci) 可由领域专家的经验得到, 而P( X = x | C = ci) 和P( X = x) 的计算则较困难。

二．贝叶斯伪代码

整个算法可以分为两个部分，“建立模型”与“进行预测”，其建立模型的伪代码如下：

numAttrValues 等简单的数据从本地数据结构中直接读取构建几个关键的计数表

for(为每一个实例) {

for( 每个属性 ){

为 numClassAndAttr 中当前类，当前属性，当前取值的单元加 1 为 attFrequencies 中当前取值单元加 1

}

预测的伪代码如下：

for(每一个类别){

for(对每个属性 xj){

for(对每个属性 xi){

if(F(xi)小于 m){

出现的次数没有超过阀值，不予考虑

}else {

查 numClassAndAttr 计算 F(y, xi, xj)

}

查 numClassAndAttr 计算 F(y, xi)

}

计算公式（7）中的评价函数，记录下这个类别的评价函数值

}

得到了各个类别上的条件概率，再带入贝叶斯公式算出最后的概率

三、算法实现代码

(1)建立模型

void GetModel::TrainModel()

{

int SplitVal;

Cache();//为模型分配内存

for (int i=0;iMaxAttNo;i++)

{

if (!gi->MaxAttValNo[i])

SplitPoint[i]=gi->SplitContinuousAtt(i);

}

for (int i=0;iMaxItemNo;i++)

{

for (int j=0;jMaxAttNo;j++)

{

if (gi->MaxAttValNo[j])

PostFreq[gi->Item[i][gi->MaxAttNo+1].DiscrValue][j][gi->Item[i][j].DiscrValue]++;

else

{

if (gi->Item[i][j].continuousVal

SplitVal=1;

else

SplitVal=2;

PostFreq[gi->Item[i][gi->MaxAttNo+1].DiscrValue][j][SplitVal]++;

}

void GetModel::Cache()

{

PostFreq=(float ***) calloc(gi->MaxClassNo+1,sizeof (float **));

for (int i=0;iMaxClassNo;i++)

{

PostFreq[i]=(float **) calloc(gi->MaxAttNo+1,sizeof (float *)); for (int j=0;jMaxAttNo;j++)

{

if (gi->MaxAttValNo[j])

PostFreq[i][j]=(float

*)calloc(gi->MaxAttValNo[j]+1,sizeof (float ));

else

PostFreq[i][j]=(float *)calloc(3,sizeof (float ));

}

SplitPoint=(float *)calloc(gi->MaxAttNo+1,sizeof (float ));

}

(2)预测

PredictClass::PredictClass(GetInfo *i,GetModel *m,string

Attname):MaxClassNo(-1),MaxDiscrValNo(2),MaxAttNo (-1)

{

gi=i;

gm=m;

filename=name;

this ->Inc=1024;

AttFileName=Attname;

}

void PredictClass::Predict()

{

float *prob;

int BestI=-1;

float MaxProb=0;

int j;

GetName();

GetData();

prob=(float *)calloc(MaxClassNo+1,sizeof (float ));

for (int i=0;i

prob[i]=1.0;

for (int i=0;i

{ name,string

for (j=0;j

{

for (int k=0;k

{

if (MaxAttValNo[k])

prob[j]*=(gm->PostFreq[j][k][Item[i][k].DiscrValue]/gi->ClassFreq[j]); else

{

if (Item[i][k].continuousValSplitPoint[k])

prob[j]*=(gm->PostFreq[j][k][1]/gi->ClassFreq[j]); else

prob[j]*=(gm->PostFreq[j][k][2]/gi->ClassFreq[j]); }

}

prob[j]*=(gi->ClassFreq[j]/(gi->MaxItemNo+1));

if (prob[j]>MaxProb)

{

MaxProb=prob[j];

BestI=j;

}

cout

Item[i][MaxAttNo+1].DiscrValue=BestI;

/*重置*/

{

BestI=-1;

MaxProb=0;

for (int i=0;i

prob[i]=1.0;

}

show();

}

void PredictClass::show()

{

int i,j;

for ( i=0;i

{

for ( j=0;j

{

if (MaxAttValNo[j])

coutelse

cout

}

cout

}

void PredictClass::GetData()

{

FILE *Nf;

char Fn[50];

filename=filename+".data" ;

filename.copy(Fn,filename.length());

Fn[filename.length()]=NULL;

if ( ! ( Nf = fopen(Fn, "r" ) ) )

Error(0, Fn, "" );

int ItemNo=0;

int ItemSpace=0;

{

MaxItemNo = ItemNo;

/* Make sure there is room for another item */

if ( ItemNo >= ItemSpace )

{

if ( ItemSpace )

{

ItemSpace += Inc;

Item = (Description

ItemSpace*sizeof (Description));

}

else

{

Item =

*)malloc((ItemSpace=Inc)*sizeof (Description));

}

Item[ItemNo] = GetDescription(Nf);

}while ( Item[ItemNo] != 0 && ++ItemNo);

fclose(Nf);

MaxItemNo=ItemNo - 1;

}

void PredictClass::Error(int n, string s1, string s2)

/* ----- */

{

cout

switch (n)

{

case 0: cout

break ;

case 1: cout

break ;

case 2: cout

case 3:cout

break ;

case 4:cout

break ;

case 5:cout

}

cout

exit(1);

}

Description PredictClass::GetDescription(FILE *Df)

{

int AttNo;/* attribute number, 0..MaxAttNo */

char name[50], *endname;

int Dv;

float Cv;

Description Dvec;

if ( ReadName(Df, name) )

{

Dvec = (Description)calloc(MaxAttNo+2, sizeof (AttValue));//因为aMaxAttNo 是从a 开始计数的在加上类别属性，所以+2

for (AttNo=0;AttNo

{

if ( MaxAttValNo[AttNo])

{

/* Discrete value */

Dv = Which(name, AttValName[AttNo], 1, MaxAttValNo[AttNo]); if ( ! Dv )

{

Error(4, AttName[AttNo], name);

}

Dvec[AttNo].DiscrValue=Dv;

}

else

{

/* Continuous value */

Cv = (float )strtod(name, &endname);

if ( endname == name || *endname != '\0' )

Error(4, AttName[AttNo], name);

Dvec[AttNo].continuousVal=Cv;

}

ReadName(Df,name);

}

return Dvec;

}

else

{

return 0;

}

void PredictClass::GetName()

{

FILE *Nf;

char Buffer[1000];

char Fn[100];

int AttCeiling=100;

int ClassCeiling=100;

int ValCeiling;

AttFileName.copy(Fn,AttFileName.length());

Fn[AttFileName.length()]=NULL;

strcat_s(Fn,".names" );

if ( ! ( Nf = fopen(Fn, "r" ) ) )

Error(0, Fn, "" );

ClassName = (string *) calloc(ClassCeiling, sizeof (string));

{

ReadName(Nf, Buffer);

if ( ++MaxClassNo >= ClassCeiling)

{

ClassCeiling += 100;

ClassName = (string *) realloc(ClassName, ClassCeiling*sizeof (string));

}

ClassName[MaxClassNo]=string(Buffer);

}

while ( Delimiter == ',' );

/* Get attribute and attribute value names from names file */

AttName = (string *) calloc(AttCeiling, sizeof (string));

MaxAttValNo = (short *) calloc(AttCeiling, sizeof (short ));

AttValName = (string **) calloc(AttCeiling, sizeof (string *));

//SpecialStatus = (char *) malloc(AttCeiling);

while ( ReadName(Nf, Buffer) )

{

if ( Delimiter != ':' )

Error(1, Buffer, "" );

if ( ++MaxAttNo >= AttCeiling )//扩大空间

{

AttCeiling += 100;

AttName = (string *) realloc(AttName, AttCeiling*sizeof (string)); MaxAttValNo = (short *) realloc(MaxAttValNo, AttCeiling*sizeof (short ));

AttValName = (string **) realloc(AttValName, AttCeiling*sizeof (string *));

//SpecialStatus = (char *) realloc(SpecialStatus, AttCeiling); }

AttName[MaxAttNo] = string(Buffer);

//SpecialStatus[MaxAttNo] = 0;

MaxAttValNo[MaxAttNo] = 0;

ValCeiling = 100;

AttValName[MaxAttNo] = (string *) calloc(ValCeiling, sizeof (string)); do

{

if ( ! ( ReadName(Nf, Buffer) ) )

Error(2, AttName[MaxAttNo], "" );

if ( ++MaxAttValNo[MaxAttNo] >= ValCeiling )

{

ValCeiling += 100;

AttValName[MaxAttNo] =(string *) realloc(AttValName[MaxAttNo], ValCeiling*sizeof (string));

}

AttValName[MaxAttNo][MaxAttValNo[MaxAttNo]] = string(Buffer); }while ( Delimiter == ',' );

if ( MaxAttValNo[MaxAttNo] == 1 )

{

/* Check for special treatment */

if (!strcmp(Buffer, "continuous" ) )

{

//MaxContAttNo++;

}

else

{

/* Cannot have only one discrete value for an attribute */ Error(3, AttName[MaxAttNo], "" );

}

MaxAttValNo[MaxAttNo] = 0;

}

else if ( MaxAttValNo[MaxAttNo] > MaxDiscrValNo )

MaxDiscrValNo = MaxAttValNo[MaxAttNo];

}

fclose(Nf);

}

bool PredictClass::ReadName(FILE *f,char *Buffer)

{

while ( ( c = getc(f) ) == '|' || space(c) )

{

if ( c == '|' )

SkipComment;

}

if ( c == EOF )

{

Delimiter = EOF;

return false ;

}

//读数据

while ( c != ':' && c != ',' && c != '\n' && c != '|' && c != EOF ) {

if ( c == '.' )

{

if ( ( c = getc(f) ) == '|' || space(c) ) //遇到‘。’后面如果是回车换行注释了，就退出

break ;

*Sp++ = '.' ;

}

if ( c == '\\' )//如果是'\\'就跳过

{

c = getc(f);

}

*Sp++ = c;

if ( c == ' ' )//跳过空格

{

while ( ( c = getc(f) ) == ' ' );

}

else

{

c = getc(f);

}

if ( c == '|' )

SkipComment;

Delimiter = c;

/* Strip trailing spaces */

while ( space(*(Sp-1)) ) Sp--;

*Sp++ = '\0';

return true ;

}

int PredictClass::Which(string Val,string List[],short First,short Last) /* ----- */

{

short n=First;

while ( n

n++;

return ( n

}

测试结果

其中原始数据在buycom.data, 测试数据在buytest.data, 测试数据类别集合为： youth medium yes fair。

朴素贝叶斯分类算法代码实现

相关文章