/* 1: Entropy of subtable with activation ON */
/*
* We now have the numbers of samples that match this part of the
* decision tree, and the number of samples for which the supplied
* condition are true. From these quantities we can find the negentropy of
* this partition.
*/
if (on_ctr > 0)
{
p_on = (REAL) p1 / (REAL) on_ctr;
p_off = 1 - p_on;
negentropy_on = -entropy (p_on) - entropy (p_off);
}
else
negentropy_on = 0.0;
/* 2: Entropy of subtable with activation OFF */
if (off_ctr > 0)
{
p_on = (REAL) p2 / (REAL) off_ctr;
p_off = 1 - p_on;
negentropy_off = -entropy (p_on) - entropy (p_off);
}
else
negentropy_off = 0.0;
printf("negentropy_on=%lf off=%lf on_ctr=%d off_ctr=%d p1=%d p2=%d\n",negentropy_on,negentropy_off,on_ctr,off_ctr,p1,p2);
ret_val.ne = (negentropy_on * on_ctr + negentropy_off * off_ctr);
ret_val.ne /= (on_ctr + off_ctr);
/*
* If all values examined were the same, set 'ret_val.status' to
* the target value since this will be an end-of-branch node
*/
if ((p1 == on_ctr) && (p2 == off_ctr))
ret_val.status = ON;
else if ((p1 == 0) && (p2 == 0))
ret_val.status = OFF;
else
ret_val.status = INACTIVE;
return ret_val;
}
/*-------------------------------------------------------------------*/
void print_tree (NODE *node, CHAR** tag_names)
{
/*
* Displays algebraic equivalent of the decision tree
*/
if (node->on == NULL)
{
if (node->idx == ON)
printf("ON");
else if (node->idx == OFF)
printf("OFF");
return;
}
else
printf("if { %s >= %1.2f then ", tag_names[node->idx], node->threshold);
print_tree ( node->on, tag_names );
printf(" else ");
print_tree ( node->off, tag_names );
printf( " } " );
}
/*-------------------------------------------------------------------*/
void read_matrix (CHAR filename[], MATRIX *matrix)
{
UINT i, j;
UINT height = matrix->height;
UINT width = matrix->width;
REAL **data = matrix->data;
FILE *f;
/* Open price file */
if ((f = fopen(filename, "r")) == NULL)
{
printf("\n File not found : %s\n", filename);
err_exit (__FILE__, __LINE__);
}
for (i=0; i for (j=0; j {
fscanf(f, "%lf,", &data[i][j] );
printf("%lf ",data[i][j]);
}
fclose(f);
printf("\nRead Matrix Finish!\n");
}
/*-------------------------------------------------------------------*/
CHAR **read_tags (CHAR *tag_file, UINT width)
{
FILE *f;
CHAR **_varname;
UINT i;
CHAR buffer[0xFF];
f = fopen(tag_file, "r");
if (f == NULL)
{
printf("\n File not found : %s\n", tag_file);
err_exit (__FILE__, __LINE__);
}
_varname = (CHAR**) malloc (width * sizeof (CHAR*));
for (i=0; i _varname[i] = (CHAR*) malloc (0xFF * sizeof (CHAR));
i = 0;
while (!feof(f))
{
if (fgets(buffer, 0xFF, f) != NULL)
{
if (strlen(buffer) > strlen("\n"))
{
if (i>width-1)
{
printf("\nMore variable names were detected than data items.%d",width);
printf("\nPlease correct this problem before proceeding");
exit(0);
}
sscanf (buffer, "%[a-zA-Z0-9-_;:!@#$%^&*(){}[]]", _varname[i]);
i++;
}
}
}
if (i {
printf("\nFewer variable names than data items were detected.%d",i);
printf("\nPlease correct this problem before proceeding");
exit(0);
}
fclose (f);
return _varname;
}
样本的例子参考了matlab的数据名为fisheriris,这是数据挖掘书上经典的鸢尾花种类的数据。把setosa改为1,versicolor改为2,virginica改为3。由于这个代码只能进行2分类,所以1为1类对应OFF,23为另外一类ON。
1.dat
5.1,3.5,1.4,0.2,1
4.9,3,1.4,0.2,1
4.7,3.2,1.3,0.2,1
4.6,3.1,1.5,0.2,1
5,3.6,1.4,0.2,1
5.4,3.9,1.7,0.4,1
4.6,3.4,1.4,0.3,1
5,3.4,1.5,0.2,1
4.4,2.9,1.4,0.2,1
4.9,3.1,1.5,0.1,1
5.4,3.7,1.5,0.2,1
4.8,3.4,1.6,0.2,1
4.8,3,1.4,0.1,1
4.3,3,1.1,0.1,1
5.8,4,1.2,0.2,1
5.7,4.4,1.5,0.4,1
5.4,3.9,1.3,0.4,1
5.1,3.5,1.4,0.3,1
5.7,3.8,1.7,0.3,1
5.1,3.8,1.5,0.3,1
5.4,3.4,1.7,0.2,1
5.1,3.7,1.5,0.4,1
4.6,3.6,1,0.2,1
5.1,3.3,1.7,0.5,1
4.8,3.4,1.9,0.2,1
5,3,1.6,0.2,1
5,3.4,1.6,0.4,1
5.2,3.5,1.5,0.2,1
5.2,3.4,1.4,0.2,1
4.7,3.2,1.6,0.2,1
4.8,3.1,1.6,0.2,1
5.4,3.4,1.5,0.4,1
5.2,4.1,1.5,0.1,1
5.5,4.2,1.4,0.2,1
4.9,3.1,1.5,0.2,1
5,3.2,1.2,0.2,1
5.5,3.5,1.3,0.2,1
4.9,3.6,1.4,0.1,1
4.4,3,1.3,0.2,1
5.1,3.4,1.5,0.2,1
5,3.5,1.3,0.3,1
4.5,2.3,1.3,0.3,1
4.4,3.2,1.3,0.2,1
5,3.5,1.6,0.6,1
5.1,3.8,1.9,0.4,1
4.8,3,1.4,0.3,1
5.1,3.8,1.6,0.2,1
4.6,3.2,1.4,0.2,1
5.3,3.7,1.5,0.2,1
5,3.3,1.4,0.2,1
7,3.2,4.7,1.4,2
6.4,3.2,4.5,1.5,2
6.9,3.1,4.9,1.5,2
5.5,2.3,4,1.3,2
6.5,2.8,4.6,1.5,2
5.7,2.8,4.5,1.3,2
6.3,3.3,4.7,1.6,2
4.9,2.4,3.3,1,2
6.6,2.9,4.6,1.3,2
5.2,2.7,3.9,1.4,2
5,2,3.5,1,2
5.9,3,4.2,1.5,2
6,2.2,4,1,2
6.1,2.9,4.7,1.4,2
5.6,2.9,3.6,1.3,2
6.7,3.1,4.4,1.4,2
5.6,3,4.5,1.5,2
5.8,2.7,4.1,1,2
6.2,2.2,4.5,1.5,2
5.6,2.5,3.9,1.1,2
5.9,3.2,4.8,1.8,2
6.1,2.8,4,1.3,2
6.3,2.5,4.9,1.5,2
6.1,2.8,4.7,1.2,2
6.4,2.9,4.3,1.3,2
6.6,3,4.4,1.4,2
6.8,2.8,4.8,1.4,2
6.7,3,5,1.7,2
6,2.9,4.5,1.5,2
5.7,2.6,3.5,1,2
5.5,2.4,3.8,1.1,2
5.5,2.4,3.7,1,2
5.8,2.7,3.9,1.2,2
6,2.7,5.1,1.6,2
5.4,3,4.5,1.5,2
6,3.4,4.5,1.6,2
6.7,3.1,4.7,1.5,2
6.3,2.3,4.4,1.3,2
5.6,3,4.1,1.3,2
5.5,2.5,4,1.3,2
5.5,2.6,4.4,1.2,2
6.1,3,4.6,1.4,2
5.8,2.6,4,1.2,2
5,2.3,3.3,1,2
5.6,2.7,4.2,1.3,2
5.7,3,4.2,1.2,2
5.7,2.9,4.2,1.3,2
6.2,2.9,4.3,1.3,2
5.1,2.5,3,1.1,2
5.7,2.8,4.1,1.3,2
6.3,3.3,6,2.5,3
5.8,2.7,5.1,1.9,3
7.1,3,5.9,2.1,3
6.3,2.9,5.6,1.8,3
6.5,3,5.8,2.2,3
7.6,3,6.6,2.1,3
4.9,2.5,4.5,1.7,3
7.3,2.9,6.3,1.8,3
6.7,2.5,5.8,1.8,3
7.2,3.6,6.1,2.5,3
6.5,3.2,5.1,2,3
6.4,2.7,5.3,1.9,3
6.8,3,5.5,2.1,3
5.7,2.5,5,2,3
5.8,2.8,5.1,2.4,3
6.4,3.2,5.3,2.3,3
6.5,3,5.5,1.8,3
7.7,3.8,6.7,2.2,3
7.7,2.6,6.9,2.3,3
6,2.2,5,1.5,3
6.9,3.2,5.7,2.3,3
5.6,2.8,4.9,2,3
7.7,2.8,6.7,2,3
6.3,2.7,4.9,1.8,3
6.7,3.3,5.7,2.1,3
7.2,3.2,6,1.8,3
6.2,2.8,4.8,1.8,3
6.1,3,4.9,1.8,3
6.4,2.8,5.6,2.1,3
7.2,3,5.8,1.6,3
7.4,2.8,6.1,1.9,3
7.9,3.8,6.4,2,3
6.4,2.8,5.6,2.2,3
6.3,2.8,5.1,1.5,3
6.1,2.6,5.6,1.4,3
7.7,3,6.1,2.3,3
6.3,3.4,5.6,2.4,3
6.4,3.1,5.5,1.8,3
6,3,4.8,1.8,3
6.9,3.1,5.4,2.1,3
6.7,3.1,5.6,2.4,3
6.9,3.1,5.1,2.3,3
5.8,2.7,5.1,1.9,3
6.8,3.2,5.9,2.3,3
6.7,3.3,5.7,2.5,3
6.7,3,5.2,2.3,3
6.3,2.5,5,1.9,3
6.5,3,5.2,2,3
6.2,3.4,5.4,2.3,3
5.9,3,5.1,1.8,3
1.tag
height
weight
lh
lw
species
编译后,这样执行:
id3.exe 1
去掉了一些帮助信息后,结果主要为:
Welcome to ID3
Last compiled on Jul 30 2011, 21:11:55
Read Matrix Finish!
neigentropy inactive
ID3 finish!
ID3 finish!
if { lh >= 3.00 then ON else OFF }