PROTO.H NEGENTROPY negentropy ( REAL **, UINT, NODE*, UINT ); void print_tree ( NODE* , CHAR** ); void free_tree ( NODE * ); NODE* ID3 ( MATRIX * , NODE* , UINT , UINT ); void err_exit ( CHAR* , UINT ); MATRIX *build_matrix ( UINT, UINT ); void free_matrix ( MATRIX * ); void read_matrix ( CHAR *, MATRIX * ); void file_size ( CHAR * , UINT * , UINT * ); CHAR **read_tags ( CHAR * , UINT ); void free_tags ( CHAR **, UINT); ID3.h typedef unsigned int UINT; typedef unsigned long ULONG; typedef char CHAR; typedef unsigned char BOOL; typedef double REAL; typedef struct node { UINT idx; /* ID code for attribute */ REAL threshold; /* Numerical threshold for attribute test */ struct node *on; /* Address of ‘on‘ node */ struct node *off; /* Address of ‘off‘ node */ struct node *parent; /* Addess of parent node */ } NODE; typedef struct ne_struct { REAL ne; UINT status; } NEGENTROPY; typedef struct matrix { UINT width; UINT height; REAL **data; } MATRIX; enum UINT { INACTIVE, OFF, ON }; #define LN_2 0.693147180559945309417 #define entropy(x) (x > 0 ? x * log(x) / LN_2 : 0.0) /* * FILE: id3.c * * Author: Andrew Colin * * DISCLAIMER: No liability is assumed by the author for any use made * of this program. * * DISTRIBUTION: Any use may be made of this program, as long as the * clear acknowledgment is made to the author in code and runtime * executables */ #include <stdio.h> #include <stdlib.h> #include <math.h> #include <float.h> #include <limits.h> #include <string.h> #include <conio.h> #include <time.h> #include "id3.h" #include "proto.h" /*-------------------------------------------------------------------*/ MATRIX *build_matrix (UINT width, UINT height) { MATRIX *_matrix; UINT i; _matrix = (MATRIX*) malloc (sizeof (MATRIX)); if (!_matrix) err_exit (__FILE__, __LINE__); _matrix->width = width; _matrix->height = height; _matrix->data = (REAL**) malloc (height * sizeof (REAL*)); if (_matrix->data == NULL) err_exit(__FILE__, __LINE__); for (i=0; i<height; i++) { _matrix->data[i] = (REAL*) malloc (width * sizeof(REAL)); if (_matrix->data[i] == NULL) err_exit(__FILE__, __LINE__); } return _matrix; } /*-------------------------------------------------------------------*/ /* * Standard error handler function */ void err_exit (CHAR* file, UINT line) { printf("\n Fatal error in file %s, line %u", file, line); exit(0); } /*-------------------------------------------------------------------*/ void file_size (CHAR *file_name, UINT *width, UINT *height) /* * Given the name of a file of numeric data, this routine counts * the numbers of rows and columns. It‘s assumed that the number * of entries is the same in each row, and an error is flagged if this * is not the case. * */ { FILE *f; UINT buf_size = 0xFF, _width = 0; CHAR *buffer, *ptr; *width = *height = 0; buffer = (CHAR*) malloc (buf_size * sizeof (CHAR)); if (buffer == NULL) err_exit (__FILE__, __LINE__); /* Open price file - abort if filename invalid */ f = fopen(file_name, "r"); if (f == NULL) { printf("\n File not found : %s\n", file_name); err_exit (__FILE__, __LINE__); } /* Get number of entries in first row */ if (fgets(buffer, buf_size, f) != NULL) { ++*height; ptr = strtok (buffer, " ,"); while (ptr != NULL) { ++*width; ptr = strtok (NULL, " ,"); } } /* Count numbers of subsequent rows */ while (!feof(f)) { if (fgets(buffer, buf_size, f) != NULL) { if (strlen(buffer) > strlen("\n")) /* if line is more than a NL char */ { ++*height; _width = 0; ptr = strtok (buffer, " ,"); while (ptr != NULL) { ++_width; ptr = strtok (NULL, " ,"); } if (*width != _width) { printf("\n Number of entries in file %s did not agree", file_name); err_exit (__FILE__, __LINE__); } } } } free (buffer); } NEGENTROPY negentropy ( REAL **data, UINT n_samples, NODE *local, UINT target) { /* * Calculates the entropy of classification of an attribute, given * a table of attributes already used, the attribute on which splitting * is to be taken, and the target attribute. Entropy is calculated in * bits, so logs are taken to base 2 by dividing by LN_2. * * The returned value always lies in the (closed) range [0, 1]. */ NEGENTROPY ret_val; NODE *_node, *_parent; UINT on_ctr, off_ctr, p1, p2, i, _match; REAL p_on, p_off, negentropy_on, negentropy_off; on_ctr = off_ctr = p1 = p2 = 0; /* Scan through all supplied data samples */ for (i=0; i<n_samples; i++) { /* * If pattern matches the current position in the decision tree, * then use this vector. The match is determined by passing up * the decision tree and checking whether ‘data[idx] >= threshold‘ * matches at each step, where idx and threshold are taken from * each node in turn. */ _match = 1; _node = local; while (_node->parent != NULL) { /* If at the root node, all entries match*/ _parent = _node->parent; if (_node == _parent->on) { /* if parent node is ON */ if (data[i][_parent->idx] < _parent->threshold) _match = 0; } else if (_node == _parent->off) { /* if parent node is OFF */ if (data[i][_parent->idx] >= _parent->threshold) _match = 0; } _node = _parent; } if (_match) { if (data[i][local->idx] >= local->threshold) { on_ctr++; if (data[i][target] >= 0.5) p1++; } else { off_ctr++; if (data[i][target] >= 0.5) p2++; } } } /* for (i=0; i<n_samples; i++) */ /* 1: Entropy of subtable with activation ON */ /* * We now have the numbers of samples that match this part of the * decision tree, and the number of samples for which the supplied * condition are true. From these quantities we can find the negentropy of * this partition. */ if (on_ctr > 0) { p_on = (REAL) p1 / (REAL) on_ctr; p_off = 1 - p_on; negentropy_on = -entropy (p_on) - entropy (p_off); } else negentropy_on = 0.0; /* 2: Entropy of subtable with activation OFF */ if (off_ctr > 0) { p_on = (REAL) p2 / (REAL) off_ctr; p_off = 1 - p_on; negentropy_off = -entropy (p_on) - entropy (p_off); } else negentropy_off = 0.0; ret_val.ne = (negentropy_on * on_ctr + negentropy_off * off_ctr); ret_val.ne /= (on_ctr + off_ctr); /* * If all values examined were the same, set ‘ret_val.status‘ to * the target value since this will be an end-of-branch node */ if ((p1 == on_ctr) && (p2 == off_ctr)) ret_val.status = ON; else if ((p1 == 0) && (p2 == 0)) ret_val.status = OFF; else ret_val.status = INACTIVE; return ret_val; } void print_tree (NODE *node, CHAR** tag_names) { /* * Displays algebraic equivalent of the decision tree */ if (node->on == NULL) { if (node->idx == ON) printf("ON"); else if (node->idx == OFF) printf("OFF"); return; } else printf("if { %s >= %1.2f then ", tag_names[node->idx], node->threshold); print_tree ( node->on, tag_names ); printf(" else "); print_tree ( node->off, tag_names ); printf( " } " ); } /*-------------------------------------------------------------------*/ void read_matrix (CHAR filename[], MATRIX *matrix) { UINT i, j; UINT height = matrix->height; UINT width = matrix->width; REAL **data = matrix->data; FILE *f; /* Open price file */ if ((f = fopen(filename, "r")) == NULL) { printf("\n File not found : %s\n", filename); err_exit (__FILE__, __LINE__); } for (i=0; i<height; i++) for (j=0; j<width; j++) { fscanf(f, "%lf\n", &data[i][j] ); } fclose(f); } /*-------------------------------------------------------------------*/ CHAR **read_tags (CHAR *tag_file, UINT width) { FILE *f; CHAR **_varname; UINT i; CHAR buffer[0xFF]; f = fopen(tag_file, "r"); if (f == NULL) { printf("\n File not found : %s\n", tag_file); err_exit (__FILE__, __LINE__); } _varname = (CHAR**) malloc (width * sizeof (CHAR*)); for (i=0; i<width; i++) _varname[i] = (CHAR*) malloc (0xFF * sizeof (CHAR)); i = 0; while (!feof(f)) { if (fgets(buffer, 0xFF, f) != NULL) { if (strlen(buffer) > strlen("\n")) { if (i>width-1) { printf("\nMore variable names were detected than data items."); printf("\nPlease correct this problem before proceeding"); exit(0); } sscanf (buffer, "%[a-zA-Z0-9-_;:!@#$%^&*(){}[]]", _varname[i]); i++; } } } if (i<width) { printf("\nFewer variable names than data items were detected."); printf("\nPlease correct this problem before proceeding"); exit(0); } fclose (f); return _varname; }
|
|