分享

ID3算法代码

 zhanghuan 2007-05-23

PROTO.H

NEGENTROPY negentropy ( REAL **, UINT, NODE*, UINT );

void print_tree ( NODE* , CHAR** );

void free_tree ( NODE  * );

NODE* ID3 ( MATRIX * , NODE* , UINT , UINT );

void err_exit ( CHAR* , UINT );

MATRIX *build_matrix ( UINT, UINT );

void free_matrix ( MATRIX * );

void read_matrix ( CHAR *, MATRIX * );

void file_size ( CHAR * , UINT * , UINT * );

CHAR **read_tags ( CHAR * , UINT );

void free_tags ( CHAR **, UINT);

 

ID3.h

typedef unsigned int  UINT;

typedef unsigned long ULONG;

typedef          char CHAR;

typedef unsigned char BOOL;

typedef double        REAL;

 

typedef struct node {

   UINT idx; /* ID code for attribute */

   REAL threshold; /* Numerical threshold for attribute test */

   struct node *on; /* Address of ‘on‘ node */

   struct node *off; /* Address of ‘off‘ node */

   struct node *parent; /* Addess of parent node */

} NODE;

 

typedef struct ne_struct {

    REAL ne;

    UINT status;

} NEGENTROPY;

 

typedef struct matrix {

   UINT width;

   UINT height;

   REAL **data;

} MATRIX;

 

enum UINT { INACTIVE, OFF, ON };

#define LN_2 0.693147180559945309417

#define entropy(x) (x > 0 ? x * log(x) / LN_2 : 0.0)

 

/*

* FILE: id3.c

*

* Author: Andrew Colin

*

* DISCLAIMER: No liability is assumed by the author for any use made

* of this program.

*

* DISTRIBUTION: Any use may be made of this program, as long as the

* clear acknowledgment is made to the author in code and runtime

* executables

*/

 

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include <float.h>

#include <limits.h>

#include <string.h>

#include <conio.h>

#include <time.h>

 

#include "id3.h"

#include "proto.h"

 

/*-------------------------------------------------------------------*/

 

MATRIX *build_matrix (UINT width, UINT height)

{

    MATRIX *_matrix;

    UINT i;

 

    _matrix = (MATRIX*) malloc (sizeof (MATRIX));

    if (!_matrix)

        err_exit (__FILE__, __LINE__);

 

    _matrix->width  = width;

    _matrix->height = height;

 

    _matrix->data = (REAL**) malloc (height * sizeof (REAL*));

    if (_matrix->data == NULL)

        err_exit(__FILE__, __LINE__);

 

    for (i=0; i<height; i++)

    {

        _matrix->data[i] = (REAL*) malloc (width * sizeof(REAL));

        if (_matrix->data[i] == NULL)

            err_exit(__FILE__, __LINE__);

    }

    return _matrix;

}

 

/*-------------------------------------------------------------------*/

 

/*

* Standard error handler function

*/

 

void err_exit (CHAR* file, UINT line)

{

    printf("\n Fatal error in file %s, line %u", file, line);

    exit(0);

}

 

/*-------------------------------------------------------------------*/

 

void file_size (CHAR *file_name, UINT *width, UINT *height)

/*

* Given the name of a file of numeric data, this routine counts

* the numbers of rows and columns. It‘s assumed that the number

* of entries is the same in each row, and an error is flagged if this

* is not the case.

*

*/

{

    FILE *f;

    UINT buf_size = 0xFF, _width = 0;

    CHAR *buffer, *ptr;

 

    *width = *height = 0;

 

    buffer = (CHAR*) malloc (buf_size * sizeof (CHAR));

    if (buffer == NULL)

        err_exit (__FILE__, __LINE__);

 

    /* Open price file - abort if filename invalid */

    f = fopen(file_name, "r");

    if (f == NULL)

    {

        printf("\n File not found : %s\n", file_name);

        err_exit (__FILE__, __LINE__);

    }

 

    /* Get number of entries in first row */

    if (fgets(buffer, buf_size, f) != NULL)

    {

        ++*height;

        ptr = strtok (buffer, " ,");

        while (ptr != NULL)

        {

            ++*width;

            ptr = strtok (NULL, " ,");

        }

    }

 

    /* Count numbers of subsequent rows */

    while (!feof(f))

    {

        if (fgets(buffer, buf_size, f) != NULL)

        {

            if (strlen(buffer) > strlen("\n"))  /* if line is more than a NL char */

            {

                ++*height;

                _width = 0;

                ptr = strtok (buffer, " ,");

                while (ptr != NULL)

                {

                    ++_width;

                    ptr = strtok (NULL, " ,");

                }

 

                if (*width != _width)

                {

                    printf("\n Number of entries in file %s did not agree", file_name);

                    err_exit (__FILE__, __LINE__);

                }

            }

        }

    }

    free (buffer);

}


 NEGENTROPY negentropy ( REAL **data,

    UINT   n_samples,

    NODE   *local,

    UINT   target)

{

    /*

     * Calculates the entropy of classification of an attribute, given

     * a table of attributes already used, the attribute on which splitting

     * is to be taken, and the target attribute. Entropy is calculated in

     * bits, so logs are taken to base 2 by dividing by LN_2.

     *

     * The returned value always lies in the (closed) range [0, 1].

     */

 

    NEGENTROPY ret_val;

    NODE *_node, *_parent;

    UINT on_ctr, off_ctr, p1, p2, i, _match;

    REAL p_on, p_off, negentropy_on, negentropy_off;

 

    on_ctr = off_ctr = p1 = p2 = 0;

 

    /* Scan through all supplied data samples */

    for (i=0; i<n_samples; i++)

    {

 

        /*

         * If pattern matches the current position in the decision tree,

         * then use this vector. The match is determined by passing up

         * the decision tree and checking whether ‘data[idx] >= threshold‘

         * matches at each step, where idx and threshold are taken from

         * each node in turn.

         */

 

        _match = 1;

        _node = local;

        while (_node->parent != NULL)

        { /* If at the root node, all entries match*/

            _parent = _node->parent;

            if (_node == _parent->on)

            { /* if parent node is ON */

                if (data[i][_parent->idx] < _parent->threshold)

                    _match = 0;

            }

            else

                if (_node == _parent->off)

                { /* if parent node is OFF */

                    if (data[i][_parent->idx] >= _parent->threshold)

                        _match = 0;

                }

            _node = _parent;

        }

 

        if (_match)

        {

            if (data[i][local->idx] >= local->threshold)

            {

                on_ctr++;

                if (data[i][target] >= 0.5)

                    p1++;

            }

            else

            {

                off_ctr++;

                if (data[i][target] >= 0.5)

                    p2++;

            }

        }

    }   /* for (i=0; i<n_samples; i++) */

 

    /* 1: Entropy of subtable with activation ON */

 

    /*

     * We now have the numbers of samples that match this part of the

     * decision tree, and the number of samples for which the supplied

     * condition are true. From these quantities we can find the negentropy of

     * this partition.

     */

 

    if (on_ctr > 0)

    {

        p_on  = (REAL) p1 / (REAL) on_ctr;

        p_off = 1 - p_on;

        negentropy_on = -entropy (p_on) - entropy (p_off);

    }

    else

        negentropy_on = 0.0;

 

    /* 2: Entropy of subtable with activation OFF */

 

    if (off_ctr > 0)

    {

        p_on  = (REAL) p2 / (REAL) off_ctr;

        p_off = 1 - p_on;

        negentropy_off = -entropy (p_on) - entropy (p_off);

    }

    else

        negentropy_off = 0.0;

 

    ret_val.ne = (negentropy_on * on_ctr + negentropy_off * off_ctr);

    ret_val.ne /= (on_ctr + off_ctr);

 

    /*

     * If all values examined were the same, set ‘ret_val.status‘ to

     * the target value since this will be an end-of-branch node

     */

 

    if ((p1 == on_ctr) && (p2 == off_ctr))

        ret_val.status = ON;

    else if ((p1 == 0) && (p2 == 0))

        ret_val.status = OFF;

    else

        ret_val.status = INACTIVE;

 

    return ret_val;

 

}


void print_tree (NODE *node, CHAR** tag_names)

{

    /*

     * Displays algebraic equivalent of the decision tree

     */

 

    if (node->on == NULL)

    {

        if (node->idx == ON)

            printf("ON");

        else if (node->idx == OFF)

            printf("OFF");

        return;

    }

 

    else

        printf("if { %s >= %1.2f then ", tag_names[node->idx], node->threshold);

 

    print_tree ( node->on, tag_names );

    printf(" else ");

    print_tree ( node->off, tag_names );

    printf( " } " );

}

 

/*-------------------------------------------------------------------*/

 

void read_matrix (CHAR filename[], MATRIX *matrix)

 

{

 

    UINT i, j;

    UINT height = matrix->height;

    UINT width  = matrix->width;

    REAL **data = matrix->data;

    FILE *f;

 

    /* Open price file */

    if ((f = fopen(filename, "r")) == NULL)

    {

        printf("\n File not found : %s\n", filename);

        err_exit (__FILE__, __LINE__);

    }

 

    for (i=0; i<height; i++)

        for (j=0; j<width; j++)

        {

            fscanf(f, "%lf\n", &data[i][j] );

        }

 

    fclose(f);

 

}

 

/*-------------------------------------------------------------------*/

 

CHAR **read_tags (CHAR *tag_file, UINT width)

{

    FILE *f;

    CHAR **_varname;

    UINT i;

    CHAR buffer[0xFF];

 

    f = fopen(tag_file, "r");

    if (f == NULL)

    {

        printf("\n File not found : %s\n", tag_file);

        err_exit (__FILE__, __LINE__);

    }

 

    _varname = (CHAR**) malloc (width * sizeof (CHAR*));

    for (i=0; i<width; i++)

        _varname[i] = (CHAR*) malloc (0xFF * sizeof (CHAR));

 

    i = 0;

    while (!feof(f))

    {

        if (fgets(buffer, 0xFF, f) != NULL)

        {

            if (strlen(buffer) > strlen("\n"))

            {

                if (i>width-1)

                {

                    printf("\nMore variable names were detected than data items.");

                    printf("\nPlease correct this problem before proceeding");

                    exit(0);

                }

                sscanf (buffer, "%[a-zA-Z0-9-_;:!@#$%^&*(){}[]]", _varname[i]);

                i++;

            }

        }

    }

 

    if (i<width)

    {

        printf("\nFewer variable names than data items were detected.");

        printf("\nPlease correct this problem before proceeding");

        exit(0);

    }

 

    fclose (f);

 

    return _varname;

 

}

 

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多