Chinaunix首页 | 论坛 | 博客
  • 博客访问: 99640
  • 博文数量: 35
  • 博客积分: 1845
  • 博客等级: 上尉
  • 技术积分: 394
  • 用 户 组: 普通用户
  • 注册时间: 2007-09-17 17:06
文章分类

全部博文(35)

文章存档

2013年(2)

2012年(2)

2011年(24)

2010年(3)

2009年(4)

我的朋友

分类: C/C++

2011-07-30 21:00:52

网上很多ID3实现的代码,但都没有数据文件的例子,不管是matlab还是c语言版都没需要自己摸索设置例子.比较麻烦,刚搞清除c语言版的,matlab的还需要再研究研究.
 
这一个id3代码只能分析数值型的数据,只能进行二分类.
输入文件:
数据文件  .dat 是以逗号分割的数据保护,n行样本,m列属性
属性标签  .tag 表示属性的名称,属性个数必须个数与m相等,一行一个名称
主函数:
1.从.dat读取行数和列数,即样本个数n和属性个数m
2.从.tag文件中读取属性的名称,最后一个是分类的属性
3.申请n*m的矩阵空间
4.从.dat文件中,将数据读入矩阵
5.进行ID3分析
 为每个样本的每个属性值,计算信息增量,记录下最小的,按这个属性和值来划分
 关键是计算信息增量的函数,negentropy
6.输出决策树
 
proto.h
NEGENTROPY negentropy ( REAL **, UINT, NODE*, UINT );
void print_tree ( NODE* , CHAR** );
void free_tree ( NODE  * );
NODE* ID3 ( MATRIX * , NODE* , UINT , UINT );
void err_exit ( CHAR* , UINT );
MATRIX *build_matrix ( UINT, UINT );
void free_matrix ( MATRIX * );
void read_matrix ( CHAR *, MATRIX * );
void file_size ( CHAR * , UINT * , UINT * );
CHAR **read_tags ( CHAR * , UINT );
void free_tags ( CHAR **, UINT);
id3.h
typedef unsigned int  UINT;
typedef unsigned long ULONG;
typedef          char CHAR;
typedef unsigned char BOOL;
typedef double        REAL;
typedef struct node {
   UINT idx; /* ID code for attribute */
   REAL threshold; /* Numerical threshold for attribute test */
   struct node *on; /* Address of 'on' node */
   struct node *off; /* Address of 'off' node */
   struct node *parent; /* Addess of parent node */
} NODE;
typedef struct ne_struct {
    REAL ne;
    UINT status;
} NEGENTROPY;
typedef struct matrix {
   UINT width;
   UINT height;
   REAL **data;
} MATRIX;
enum UINT { INACTIVE, OFF, ON };
#define LN_2 0.693147180559945309417
#define entropy(x) (x > 0 ? x * log(x) / LN_2 : 0.0)
id3.c

/*
* FILE: id3.c
*
* Author: Andrew Colin
*
* DISCLAIMER: No liability is assumed by the author for any use made
* of this program.
*
* DISTRIBUTION: Any use may be made of this program, as long as the
* clear acknowledgment is made to the author in code and runtime
* executables
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include "id3.h"
#include "proto.h"
/*-------------------------------------------------------------------*/
MATRIX *build_matrix (UINT width, UINT height)
{
    MATRIX *_matrix;
    UINT i;
    _matrix = (MATRIX*) malloc (sizeof (MATRIX));
    if (!_matrix)
        err_exit (__FILE__, __LINE__);
    _matrix->width  = width;
    _matrix->height = height;
    _matrix->data = (REAL**) malloc (height * sizeof (REAL*));
    if (_matrix->data == NULL)
        err_exit(__FILE__, __LINE__);
    for (i=0; i    {
        _matrix->data[i] = (REAL*) malloc (width * sizeof(REAL));
        if (_matrix->data[i] == NULL)
            err_exit(__FILE__, __LINE__);
    }
    return _matrix;
}
/*-------------------------------------------------------------------*/
/*
* Standard error handler function
*/
void err_exit (CHAR* file, UINT line)
{
    printf("\n Fatal error in file %s, line %u", file, line);
    exit(0);
}
/*-------------------------------------------------------------------*/
void file_size (CHAR *file_name, UINT *width, UINT *height)
/*
* Given the name of a file of numeric data, this routine counts
* the numbers of rows and columns. It's assumed that the number
* of entries is the same in each row, and an error is flagged if this
* is not the case.
*
*/
{
    FILE *f;
    UINT buf_size = 0xFF, _width = 0;
    CHAR *buffer, *ptr;
    *width = *height = 0;
    buffer = (CHAR*) malloc (buf_size * sizeof (CHAR));
    if (buffer == NULL)
        err_exit (__FILE__, __LINE__);
    /* Open price file - abort if filename invalid */
    f = fopen(file_name, "r");
    if (f == NULL)
    {
        printf("\n File not found : %s\n", file_name);
        err_exit (__FILE__, __LINE__);
    }
    /* Get number of entries in first row */
    if (fgets(buffer, buf_size, f) != NULL)
    {
        ++*height;
        ptr = strtok (buffer, " ,");
        while (ptr != NULL)
        {
            ++*width;
            ptr = strtok (NULL, " ,");
        }
    }
    /* Count numbers of subsequent rows */
    while (!feof(f))
    {
        if (fgets(buffer, buf_size, f) != NULL)
        {
            if (strlen(buffer) > strlen("\n"))  /* if line is more than a NL char */
            {
                ++*height;
                _width = 0;
                ptr = strtok (buffer, " ,");
                while (ptr != NULL)
                {
                    ++_width;
                    ptr = strtok (NULL, " ,");
                }
                if (*width != _width)
                {
                    printf("\n Number of entries in file %s did not agree", file_name);
                    err_exit (__FILE__, __LINE__);
                }
            }
        }
    }
    free (buffer);
}
/*-------------------------------------------------------------------*/
void free_matrix (MATRIX *_matrix)
{
    UINT i;
    for (i=0; i<_matrix->height; i++)
        free (_matrix->data[i]);
    free (_matrix->data);
    free (_matrix);
}
/*-------------------------------------------------------------------*/
void free_tags ( CHAR** varname, UINT width)
{
    UINT i;
    for (i=0; i        free(varname[i]);
    free (varname);
}
/*-------------------------------------------------------------------*/
void free_tree ( NODE  *node )
{
    /*
     *  Frees the memory allocated to a tree structure
     */
    if (node == NULL)
        return;
    else
    {
        free_tree (node->on);
        free_tree (node->off);
        free(node);
    }
}
/*-------------------------------------------------------------------*/
NODE* ID3 ( MATRIX *matrix, NODE* parent, UINT target, UINT state)
/* Routine to build a decision tree, based on Quinlan's ID3 algorithm. */
{
    NEGENTROPY negentropy_struct;
    NODE *node;
    UINT n_vars = matrix->width, n_samples = matrix->height, i, j, split;
    REAL **data = matrix->data;
    REAL best_threshold, min_negentropy, _negentropy;
    /* Allocate memory for this node */
    node = (NODE*) malloc (sizeof(NODE));
    if (!node)
        err_exit (__FILE__, __LINE__);
    /* Set up links in decision tree */
    node->parent = parent;  /* Set address of parent node */
    if (parent != NULL) /* parent to child; not relevant for root node */
    {
        /* Pass address of this node to the parent node */
        if (state == ON)
            parent->on = node;
        else
            if (state == OFF)
                parent->off = node;
    }
    /*
     * Select attribute with lowest negentropy for splitting. Scan through
     * ALL attributes (except the target) and ALL data samples. This is
     * pretty inefficient for data sets with repeated values, but will do
     * for illustrative purposes
     */
    min_negentropy = 1.0;
    for (i=0; i    {
        for (j=0; j        {
            if (i != target)
            {
                /* Set trial values for this node... */
                node->idx = i;
                node->threshold = data[j][i];
                /* ...and calculate the negentropy of this partition */
                negentropy_struct = negentropy (data, n_samples, node, target);
                _negentropy = negentropy_struct.ne;
                /* If this negentropy is lower than any other, retain the
                       index and threshold for future use */
                if (_negentropy < min_negentropy)
                {
                    min_negentropy = _negentropy;
                    split = i;
                    best_threshold = data[j][i];
                }
            } /*if (i != target)*/
        } /*for (j=0; j
    } /*for (i=0; i
    /* Save the combination of best attribute and threshold value */
    node->idx = split;
    node->threshold = best_threshold;
    /*
     * If the negentropy routine found itself at an end-of-branch
     * for the decision tree, the 'status' flag in 'negentropy_struct'
     * is set to ON or OFF and the node labelled accordingly. Otherwise,
     * ID3 continues to call itself until all end-of-branch nodes are
     * found.
     */
    if  (negentropy_struct.status != INACTIVE)
    {
        node->on = node->off = NULL;
        node->idx = negentropy_struct.status;
        printf("neigentropy inactive\n");
    }
    else
    {
        node->on  = ID3 (matrix, node, target, ON);
        node->off = ID3 (matrix, node, target, OFF);
    }
   
    printf("ID3 finish!\n");
    return node;
   
}
/*-------------------------------------------------------------------*/
void main (int argv, char *argc[])
{
    MATRIX *matrix;
    NODE *node;
    UINT target, n_vars, n_samples;
    CHAR data_file[13], tag_file[13];  /* Longest file name in DOS */
    CHAR **tag_names;
    /* Set up file names */
    if (argv != 2)
    {
        printf("\nUsage: id3 [datafile]");
        exit(0);
    }
    else
    {
        printf("\nWelcome to ID3");
        printf("\nLast compiled on %s, %s", __DATE__, __TIME__);
        printf("\n");
        strcpy(data_file, argc[1]);
        strcpy(tag_file,  argc[1]);
        strcat(data_file, ".dat");
        strcat(tag_file,  ".tag");
    }
    /* Read dimensions of data file */
    file_size (data_file, &n_vars, &n_samples);
    /* Read labels for columns of data */
    tag_names = read_tags (tag_file, n_vars);
    /* Allocate storage for data... */
    matrix = build_matrix (n_vars, n_samples);
    /* ...and read it from disk */
    read_matrix (data_file, matrix);
    /* Classification target is last column */
    target = n_vars - 1;
    /* Return root of decision tree - ID3 continues to call itself
       recursively */
    node = ID3 ( matrix, NULL, target, 0 );
    print_tree(node, tag_names);
    printf("\n");
    free_tags (tag_names, n_vars);
    free_matrix(matrix);
    free_tree (node);
}
/*-------------------------------------------------------------------*/
NEGENTROPY negentropy ( REAL **data,
    UINT   n_samples,
    NODE   *local,
    UINT   target)
{
    /*
     * Calculates the entropy of classification of an attribute, given
     * a table of attributes already used, the attribute on which splitting
     * is to be taken, and the target attribute. Entropy is calculated in
     * bits, so logs are taken to base 2 by dividing by LN_2.
     *
     * The returned value always lies in the (closed) range [0, 1].
     */
    NEGENTROPY ret_val;
    NODE *_node, *_parent;
    UINT on_ctr, off_ctr, p1, p2, i, _match;
    REAL p_on, p_off, negentropy_on, negentropy_off;
    on_ctr = off_ctr = p1 = p2 = 0;
    /* Scan through all supplied data samples */
    for (i=0; i    {
        /*
         * If pattern matches the current position in the decision tree,
         * then use this vector. The match is determined by passing up
         * the decision tree and checking whether 'data[idx] >= threshold'
         * matches at each step, where idx and threshold are taken from
         * each node in turn.
         */
        _match = 1;
        _node = local;
        while (_node->parent != NULL)
        { /* If at the root node, all entries match*/
            _parent = _node->parent;
            if (_node == _parent->on)
            { /* if parent node is ON */
                if (data[i][_parent->idx] < _parent->threshold)
                    _match = 0;
            }
            else
                if (_node == _parent->off)
                { /* if parent node is OFF */
                    if (data[i][_parent->idx] >= _parent->threshold)
                        _match = 0;
                }
            _node = _parent;
        }
        if (_match)
        {
            if (data[i][local->idx] >= local->threshold)
            {
                on_ctr++;
                if (data[i][target] >= 2)
                    p1++;
               // printf(" %lf ",data[i][target]);
            }
            else
            {
                off_ctr++;
                if (data[i][target] >= 2)
                    p2++;
            }
        }
    }   /* for (i=0; i
    /* 1: Entropy of subtable with activation ON */
    /*
     * We now have the numbers of samples that match this part of the
     * decision tree, and the number of samples for which the supplied
     * condition are true. From these quantities we can find the negentropy of
     * this partition.
     */
    if (on_ctr > 0)
    {
        p_on  = (REAL) p1 / (REAL) on_ctr;
        p_off = 1 - p_on;
        negentropy_on = -entropy (p_on) - entropy (p_off);
    }
    else
        negentropy_on = 0.0;
    /* 2: Entropy of subtable with activation OFF */
    if (off_ctr > 0)
    {
        p_on  = (REAL) p2 / (REAL) off_ctr;
        p_off = 1 - p_on;
        negentropy_off = -entropy (p_on) - entropy (p_off);
    }
    else
        negentropy_off = 0.0;
    printf("negentropy_on=%lf off=%lf on_ctr=%d off_ctr=%d p1=%d p2=%d\n",negentropy_on,negentropy_off,on_ctr,off_ctr,p1,p2);
    ret_val.ne = (negentropy_on * on_ctr + negentropy_off * off_ctr);
    ret_val.ne /= (on_ctr + off_ctr);
    /*
     * If all values examined were the same, set 'ret_val.status' to
     * the target value since this will be an end-of-branch node
     */
    if ((p1 == on_ctr) && (p2 == off_ctr))
        ret_val.status = ON;
    else if ((p1 == 0) && (p2 == 0))
        ret_val.status = OFF;
    else
        ret_val.status = INACTIVE;
    return ret_val;
}
/*-------------------------------------------------------------------*/
void print_tree (NODE *node, CHAR** tag_names)
{
    /*
     * Displays algebraic equivalent of the decision tree
     */
    if (node->on == NULL)
    {
        if (node->idx == ON)
            printf("ON");
        else if (node->idx == OFF)
            printf("OFF");
        return;
    }
    else
        printf("if { %s >= %1.2f then ", tag_names[node->idx], node->threshold);
    print_tree ( node->on, tag_names );
    printf(" else ");
    print_tree ( node->off, tag_names );
    printf( " } " );
}
/*-------------------------------------------------------------------*/
void read_matrix (CHAR filename[], MATRIX *matrix)
{
    UINT i, j;
    UINT height = matrix->height;
    UINT width  = matrix->width;
    REAL **data = matrix->data;
    FILE *f;
    /* Open price file */
    if ((f = fopen(filename, "r")) == NULL)
    {
        printf("\n File not found : %s\n", filename);
        err_exit (__FILE__, __LINE__);
    }
    for (i=0; i        for (j=0; j        {
            fscanf(f, "%lf,", &data[i][j] );
            printf("%lf ",data[i][j]);
        }
    fclose(f);
    printf("\nRead Matrix Finish!\n");
}
/*-------------------------------------------------------------------*/
CHAR **read_tags (CHAR *tag_file, UINT width)
{
    FILE *f;
    CHAR **_varname;
    UINT i;
    CHAR buffer[0xFF];
    f = fopen(tag_file, "r");
    if (f == NULL)
    {
        printf("\n File not found : %s\n", tag_file);
        err_exit (__FILE__, __LINE__);
    }
    _varname = (CHAR**) malloc (width * sizeof (CHAR*));
    for (i=0; i        _varname[i] = (CHAR*) malloc (0xFF * sizeof (CHAR));
    i = 0;
    while (!feof(f))
    {
        if (fgets(buffer, 0xFF, f) != NULL)
        {
            if (strlen(buffer) > strlen("\n"))
            {
                if (i>width-1)
                {
                    printf("\nMore variable names were detected than data items.%d",width);
                    printf("\nPlease correct this problem before proceeding");
                    exit(0);
                }
                sscanf (buffer, "%[a-zA-Z0-9-_;:!@#$%^&*(){}[]]", _varname[i]);
                i++;
            }
        }
    }
    if (i    {
        printf("\nFewer variable names than data items were detected.%d",i);
        printf("\nPlease correct this problem before proceeding");
        exit(0);
    }
    fclose (f);
    return _varname;
}
 
样本的例子参考了matlab的数据名为fisheriris,这是数据挖掘书上经典的鸢尾花种类的数据。把setosa改为1,versicolor改为2,virginica改为3。由于这个代码只能进行2分类,所以1为1类对应OFF,23为另外一类ON。
1.dat
5.1,3.5,1.4,0.2,1
4.9,3,1.4,0.2,1
4.7,3.2,1.3,0.2,1
4.6,3.1,1.5,0.2,1
5,3.6,1.4,0.2,1
5.4,3.9,1.7,0.4,1
4.6,3.4,1.4,0.3,1
5,3.4,1.5,0.2,1
4.4,2.9,1.4,0.2,1
4.9,3.1,1.5,0.1,1
5.4,3.7,1.5,0.2,1
4.8,3.4,1.6,0.2,1
4.8,3,1.4,0.1,1
4.3,3,1.1,0.1,1
5.8,4,1.2,0.2,1
5.7,4.4,1.5,0.4,1
5.4,3.9,1.3,0.4,1
5.1,3.5,1.4,0.3,1
5.7,3.8,1.7,0.3,1
5.1,3.8,1.5,0.3,1
5.4,3.4,1.7,0.2,1
5.1,3.7,1.5,0.4,1
4.6,3.6,1,0.2,1
5.1,3.3,1.7,0.5,1
4.8,3.4,1.9,0.2,1
5,3,1.6,0.2,1
5,3.4,1.6,0.4,1
5.2,3.5,1.5,0.2,1
5.2,3.4,1.4,0.2,1
4.7,3.2,1.6,0.2,1
4.8,3.1,1.6,0.2,1
5.4,3.4,1.5,0.4,1
5.2,4.1,1.5,0.1,1
5.5,4.2,1.4,0.2,1
4.9,3.1,1.5,0.2,1
5,3.2,1.2,0.2,1
5.5,3.5,1.3,0.2,1
4.9,3.6,1.4,0.1,1
4.4,3,1.3,0.2,1
5.1,3.4,1.5,0.2,1
5,3.5,1.3,0.3,1
4.5,2.3,1.3,0.3,1
4.4,3.2,1.3,0.2,1
5,3.5,1.6,0.6,1
5.1,3.8,1.9,0.4,1
4.8,3,1.4,0.3,1
5.1,3.8,1.6,0.2,1
4.6,3.2,1.4,0.2,1
5.3,3.7,1.5,0.2,1
5,3.3,1.4,0.2,1
7,3.2,4.7,1.4,2
6.4,3.2,4.5,1.5,2
6.9,3.1,4.9,1.5,2
5.5,2.3,4,1.3,2
6.5,2.8,4.6,1.5,2
5.7,2.8,4.5,1.3,2
6.3,3.3,4.7,1.6,2
4.9,2.4,3.3,1,2
6.6,2.9,4.6,1.3,2
5.2,2.7,3.9,1.4,2
5,2,3.5,1,2
5.9,3,4.2,1.5,2
6,2.2,4,1,2
6.1,2.9,4.7,1.4,2
5.6,2.9,3.6,1.3,2
6.7,3.1,4.4,1.4,2
5.6,3,4.5,1.5,2
5.8,2.7,4.1,1,2
6.2,2.2,4.5,1.5,2
5.6,2.5,3.9,1.1,2
5.9,3.2,4.8,1.8,2
6.1,2.8,4,1.3,2
6.3,2.5,4.9,1.5,2
6.1,2.8,4.7,1.2,2
6.4,2.9,4.3,1.3,2
6.6,3,4.4,1.4,2
6.8,2.8,4.8,1.4,2
6.7,3,5,1.7,2
6,2.9,4.5,1.5,2
5.7,2.6,3.5,1,2
5.5,2.4,3.8,1.1,2
5.5,2.4,3.7,1,2
5.8,2.7,3.9,1.2,2
6,2.7,5.1,1.6,2
5.4,3,4.5,1.5,2
6,3.4,4.5,1.6,2
6.7,3.1,4.7,1.5,2
6.3,2.3,4.4,1.3,2
5.6,3,4.1,1.3,2
5.5,2.5,4,1.3,2
5.5,2.6,4.4,1.2,2
6.1,3,4.6,1.4,2
5.8,2.6,4,1.2,2
5,2.3,3.3,1,2
5.6,2.7,4.2,1.3,2
5.7,3,4.2,1.2,2
5.7,2.9,4.2,1.3,2
6.2,2.9,4.3,1.3,2
5.1,2.5,3,1.1,2
5.7,2.8,4.1,1.3,2
6.3,3.3,6,2.5,3
5.8,2.7,5.1,1.9,3
7.1,3,5.9,2.1,3
6.3,2.9,5.6,1.8,3
6.5,3,5.8,2.2,3
7.6,3,6.6,2.1,3
4.9,2.5,4.5,1.7,3
7.3,2.9,6.3,1.8,3
6.7,2.5,5.8,1.8,3
7.2,3.6,6.1,2.5,3
6.5,3.2,5.1,2,3
6.4,2.7,5.3,1.9,3
6.8,3,5.5,2.1,3
5.7,2.5,5,2,3
5.8,2.8,5.1,2.4,3
6.4,3.2,5.3,2.3,3
6.5,3,5.5,1.8,3
7.7,3.8,6.7,2.2,3
7.7,2.6,6.9,2.3,3
6,2.2,5,1.5,3
6.9,3.2,5.7,2.3,3
5.6,2.8,4.9,2,3
7.7,2.8,6.7,2,3
6.3,2.7,4.9,1.8,3
6.7,3.3,5.7,2.1,3
7.2,3.2,6,1.8,3
6.2,2.8,4.8,1.8,3
6.1,3,4.9,1.8,3
6.4,2.8,5.6,2.1,3
7.2,3,5.8,1.6,3
7.4,2.8,6.1,1.9,3
7.9,3.8,6.4,2,3
6.4,2.8,5.6,2.2,3
6.3,2.8,5.1,1.5,3
6.1,2.6,5.6,1.4,3
7.7,3,6.1,2.3,3
6.3,3.4,5.6,2.4,3
6.4,3.1,5.5,1.8,3
6,3,4.8,1.8,3
6.9,3.1,5.4,2.1,3
6.7,3.1,5.6,2.4,3
6.9,3.1,5.1,2.3,3
5.8,2.7,5.1,1.9,3
6.8,3.2,5.9,2.3,3
6.7,3.3,5.7,2.5,3
6.7,3,5.2,2.3,3
6.3,2.5,5,1.9,3
6.5,3,5.2,2,3
6.2,3.4,5.4,2.3,3
5.9,3,5.1,1.8,3
1.tag
height
weight
lh
lw
species
 
编译后,这样执行:
id3.exe 1
 
去掉了一些帮助信息后,结果主要为:
Welcome to ID3
Last compiled on Jul 30 2011, 21:11:55
Read Matrix Finish!
neigentropy inactive
ID3 finish!
ID3 finish!
if { lh >= 3.00 then ON else OFF }
阅读(1465) | 评论(1) | 转发(0) |
给主人留下些什么吧!~~

tomorrow05302011-07-30 21:26:16

如果把12归为1类,3为1类;
把  if (data[target] >= 2)改为 if (data[target] >= 3)
得到下面的结果:

if { lh >= 4.80 then if { lw >= 1.80 then if { lh >= 4.90 then ON else if { height >= 6.00 then ON else OFF }  }  else if { lh >= 5.00 then if { lw >= 1.60 then if { height >= 7.00 then ON else OFF }  else ON }  else OFF }  }  else if { lw >= 1.70 then ON else OFF }  }