分享

huffman编码解码

 A_Geek 2014-01-09

/*

* huffman - Encode/Decode files using Huffman encoding.

* Copyright (C) 2003 Douglas Ryan Richardson; Gauss Interprise, Inc

*

* This library is free software; you can redistribute it and/or

* modify it under the terms of the GNU Lesser General Public

* License as published by the Free Software Foundation; either

* version 2.1 of the License, or (at your option) any later version.

*

* This library is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

* Lesser General Public License for more details.

*

* You should have received a copy of the GNU Lesser General Public

* License along with this library; if not, write to the Free Software

* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

*/

/*

* 针对可打印字符的huffman编解码.

*/

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <assert.h>

#include "huffman.h"

#ifdef WIN32

#include <winsock2.h>

#include <winsock.h>

#include <malloc.h>

#define alloca _alloca

#else

#include <netinet/in.h>

#include <sys/socket.h>

#include <sys/types.h>

#endif

// ---------------------------------------------------------------------------------------

/* 描述huffman树的节点 */

typedef struct huffman_node_tag

{

unsigned char isLeaf; // 当前节点是否为叶子节点

unsigned long count; // (字符出现的次数)

struct huffman_node_tag *parent; // 当前节点的父亲

union {

/*

* 如果改节点为叶子节点, 则有unsigned char symbol字段,

* 如果该节点不是叶子节点, 则具有左孩子右孩子字段.

*/

struct {

struct huffman_node_tag *zero; // 左孩子

struct huffman_node_tag *one; // 右孩子

};

unsigned char symbol; // 存储的字符.

};

} huffman_node;

/* 编码 */

typedef struct huffman_code_tag

{

/*

* numbits 这个字段是用来记录从leaf--->root一共走了多少步.

* 也就是叶子节点所含字符的编码长度

* bits 是用来存储编码的, 但是它是以Byte为单位的, 而编码是

* bit为单位的, 所以需要根据 numbits 去从 bits 里提取出前

* numbits bit 这才是真正的编码.

*

* 比如

* numbits=4

* bits = 0110 1011

* 那么我们应该提取bits的低4 1011

*

* 注意 numbits bits是有关系的, bits一定不可能超过 numbits/8

*/

/* The length of this code in bits. */

unsigned long numbits;/* 记录从叶子走到root需要多少步, 也就是说需要多少位来对指定的字符进行编码 */

/* The bits that make up this code. The first

bit is at position 0 in bits[0]. The second

bit is at position 1 in bits[0]. The eighth

bit is at position 7 in bits[0]. The ninth

bit is at position 0 in bits[1]. */

unsigned char *bits; /* 用来存储编码String */

} huffman_code;

int

main(int argc, char** argv)

{

/*

char memory = 0;

char compress = 1;

int opt;

const char *file_in = NULL;

const char *file_out = NULL;

*/

//FILE *in = stdin;

//FILE *out = stdout;

FILE *in = NULL;

FILE *out = NULL;

FILE *out2 = NULL;

in = fopen("test.txt", "r+");

out = fopen("test1.txt", "w+");

//out2 = fopen("test2.txt", "w+");

int result = 0;

result = huffman_encode_file(in, out);

//result = huffman_decode_file(out, out2);

printf("result : %d/n", result);

system("pause");

return 1;

}

// ---------------------------------------------------------------------------------------

/**

* bit-->byte转换, 不足8位则补齐8.

*/

static unsigned long

numbytes_from_numbits(unsigned long numbits)

{

return numbits / 8 + (numbits % 8 ? 1 : 0);

}

/*

* get_bit returns the ith bit in the bits array

* in the 0th position of the return value.

* 取出第i(从低位往高位排)

*/

static unsigned char

get_bit(unsigned char *bits, unsigned long i)

{

return ( bits[i/8] >> (i%8) ) & 1;

}

/**

* 反转数组的前((numbits/8)+1)个字符

*

*/

static void

reverse_bits(unsigned char* bits, unsigned long numbits)

{

unsigned long numbytes = numbytes_from_numbits(numbits); // 所占字节数.

unsigned char *tmp = (unsigned char*)calloc(numbytes, sizeof(unsigned char)); // 分配内存

unsigned long curbit; // index -- 当前位

long curbyte = 0; // 当前是byteindex

memset(tmp, 0, numbytes); // tmp指向的buffer清零

/*

*

*/

for(curbit=0; curbit<numbits; ++curbit) {

unsigned int bitpos = curbit % 8; // 当前byte里的index

if(curbit>0 && curbit%8==0) { //

++curbyte;

}

/*

* 按位 OR

*

*/

tmp[curbyte] |= ( get_bit(bits, numbits-curbit-1) << bitpos );

}

memcpy(bits, tmp, numbytes);

}

/*

* new_code builds a huffman_code from a leaf in

* a Huffman tree.

* 对指定的叶子进行编码.

*/

static huffman_code*

new_code(const huffman_node* leaf)

{

/* Build the huffman code by walking up to

* the root node and then reversing the bits,

* since the Huffman code is calculated by

* walking down the tree. */

unsigned long numbits = 0;

unsigned char *bits = NULL;

huffman_code *p;

while(leaf!=NULL && leaf->parent!=NULL) {

huffman_node *parent = leaf->parent; // 当前活动节点的双亲

unsigned long cur_byte = numbits / 8; // 当前byte

unsigned char cur_bit = (unsigned char)(numbits % 8); // 当前byte里的当前bit

/* If we need another byte to hold the code,

then allocate it. */

if(cur_bit == 0) { // 满了一个byte就需要重新分配内存

size_t newSize = cur_byte + 1;

bits = (char*)realloc(bits, newSize);

bits[newSize - 1] = 0; /* Initialize the new byte. */

}

/*

* ************************************************************************

* 需要注意的是右孩子的处理, 因为左孩子已经是0, 初始分配bits的时候就已经把每位都设置为0.

* ************************************************************************

*/

/* If a one must be added then or it in. If a zero

* must be added then do nothing, since the byte

* was initialized to zero. */

if(leaf == parent->one) {

/*

* 右孩子的话就需要把当前位置为1

*/

bits[cur_byte] |= (1<<cur_bit);

}

++numbits;

leaf = parent;

}

if( bits != 0) {

/*

* 如果编码里头含有1, 则需要进行反转, 如果全为0, 反转后和反转前是一样的, 就没必要反转了

*/

reverse_bits(bits, numbits);

}

p = (huffman_code*)malloc(sizeof(huffman_code));

p->numbits = numbits; /* 记录从叶子走到root需要多少步, 也就是说需要多少位来对指定的字符进行编码 */

p->bits = bits; /* 用来存储编码的区间 */

return p;

}

/**

* 创建一个孤立的"叶子"节点, 该节点为一个单独的树

* symbol : 该叶子节点的权,即字符

*/

static huffman_node*

new_leaf_node(unsigned char symbol)

{

huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) );

p->isLeaf = 1;

p->symbol = symbol;

p->count = 0;

p->parent = 0;

return p;

}

/**

* 创建节点(该节点不是叶子节点)

*

* count : 字符出现的次数

* zero : 左孩子

* one : 右兄弟

*

* return : 返回一个节点对象

*/

static huffman_node*

new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)

{

huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) );

p->isLeaf = 0;

p->count = count;

p->zero = zero;

p->one = one;

p->parent = 0;

return p;

}

/**

* 释放树占用的内存空间

*/

static void

free_huffman_tree(huffman_node *subtree)

{

if(subtree == NULL)

return;

if( !(subtree->isLeaf) ) {

/*

* 先序遍历进行递归调用

*/

free_huffman_tree( subtree->zero );

free_huffman_tree( subtree->one );

}

free( subtree );

}

/**

* free code

*/

static void

free_code(huffman_code* p)

{

free(p->bits);

free(p);

}

// ----------------------------------------------------------------------------------------

#define MAX_SYMBOLS 256

typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS]; /* */

typedef huffman_code* SymbolEncoder[MAX_SYMBOLS]; /* */

/* */

static void

free_encoder(SymbolEncoder *pSE)

{

unsigned long i;

for(i = 0; i < MAX_SYMBOLS; ++i) {

huffman_code *p = (*pSE)[i];

if( p ) free_code(p);

}

}

/* */

static void

init_frequencies(SymbolFrequencies *pSF)

{

memset(*pSF, 0, sizeof(SymbolFrequencies) ); /* 清零 */

}

// ----------------------------------------------------------------------------------------

typedef struct buf_cache_tag

{

/*

* 该结构主要描述了两个部分, 一个是cache, 一个是bufout.

* cache是一个临时存储数据的buffer, cache会将数据写往bufout区间,

* bufout类似一个仓库, 会一直存储cache写入的数据.

* cache可以多次网bufout内写数据, bufout会一直保存这些数据.

* bufout是一个动态的buffer, cache每一次往bufout内写数据的时候bufout都需要realloc一次.

*/

unsigned char *cache; // 指向真正存储数据的buffer

unsigned int cache_len; // buffer的长度, 初始的时候就可以设置 cache的大小的

unsigned int cache_cur; // 数据结尾处(或者说是光标位置)

unsigned char **pbufout; /*

* cache要写数据就往这个空间内写(类似一个动态仓库, 一定是动态的)

* (*pbufout)就是真实的存储区

*/

unsigned int *pbufoutlen; // 仓库的大小

} buf_cache;

/* 初始化一个buf_cache */

static int init_cache(buf_cache *pc,

unsigned int cache_size,

unsigned char **pbufout,

unsigned int *pbufoutlen)

{

assert(pc && pbufout && pbufoutlen);

if(!pbufout || !pbufoutlen) return 1;

pc->cache = (unsigned char*)malloc(cache_size); // 分配存储空间

pc->cache_len = cache_size; //

pc->cache_cur = 0; // 光标从0开始

pc->pbufout = pbufout; //

*pbufout = NULL; //

pc->pbufoutlen = pbufoutlen;

*pbufoutlen = 0; //

return (pc->cache==NULL 0 : 1);

}

/* 释放buf_cache */

static void free_cache(buf_cache* pc)

{

assert( pc );

if( pc->cache != NULL)

{

/*

* 我觉得这里没有必要free( pc->cache );, 直接执行pc->cache = NULL;就可以保证

* 逻辑上清cache, 至于真实的存储区内是否仍然有数据并没有意义.

*/

free( pc->cache );

pc->cache = NULL;

}

}

/*

* cache内的数据写到pbufout中去, 并清洗cache

* 成功则返回0, 失败返回1.

*/

static int flush_cache(buf_cache* pc)

{

assert( pc );

if(pc->cache_cur > 0) // 确定cache_cur有平移, 这样才能确定cache内有数据. 才可以flush

{

/* 当前要写的数据长度为pc->cache_cur, 原来的bufout里头本身还有*(pc->pbufoutlen)长度的数据 */

unsigned int newlen = pc->cache_cur + *(pc->pbufoutlen);

/* 需要重新为*(pc->pbufout)分配空间, tmp为这个新buffer的首地址 */

unsigned char* tmp = realloc(*(pc->pbufout), newlen);

if( !tmp ) return 1;

/* 追加到pbufout结尾处, 而不是覆盖 */

memcpy(tmp + *(pc->pbufoutlen), pc->cache, pc->cache_cur);

*pc->pbufout = tmp; // pbufout指针重定位到新的扩大了的内存区.

*pc->pbufoutlen = newlen; // 重新计算pbufoutlen

pc->cache_cur = 0; // cache逻辑上清零

}

return 0;

}

/* cache */

static int write_cache(buf_cache* pc,

const void *to_write,

unsigned int to_write_len)

{

unsigned char* tmp;

assert(pc && to_write);

assert(pc->cache_len >= pc->cache_cur);

/* If trying to write more than the cache will hold

* flush the cache and allocate enough space immediately,

* that is, don't use the cache. */

if(to_write_len > pc->cache_len - pc->cache_cur)

{

/*

* to_write_len : 需要往cache内写的数据长度

* pc->cache_len-pc->cache_cur : cache内剩余空间

* 如果cache存储能力不够则先清洗cache, 再将数据直接写到

* pbufout中去, 不使用cache.

*/

unsigned int newlen;

flush_cache( pc );

newlen = *pc->pbufoutlen + to_write_len;

tmp = realloc(*pc->pbufout, newlen);

if( !tmp ) return 1;

memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);

*pc->pbufout = tmp;

*pc->pbufoutlen = newlen;

}

else

{

/*

* Write the data to the cache

* 如果cache存储能力足够,则往cache内追加数据且将当前光标移动到新的数据尾部.

*/

memcpy(pc->cache+pc->cache_cur, to_write, to_write_len);

pc->cache_cur += to_write_len;

}

return 0;

}

// -------------------------------------------------------------------------------------

/*

* 计算FILE对象内的各个字符出现的频率

* 扫描FILE对象,

*/

static unsigned int

get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)

{

int c;

unsigned int total_count = 0; // FILE对象内的字符总数

init_frequencies( pSF ); /* Set all frequencies to 0. */

/* Count the frequency of each symbol in the input file. */

while( (c=fgetc(in)) != EOF )

{

unsigned char uc = c;

if( !(*pSF)[uc] ) // 如果这个字符没有出现过则为这个字符建立一个叶子

{

/*

* 这里设计的相当有意思,

* fgetc会获得当前光标所在的字符, 这个字符必然是一个char, 也就是一个

* unsinged int型数.

* 我们前面有定义

* #define MAX_SYMBOLS 256

* typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];

* typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];

* 这里我们采用SymbolFrequencies[uc]就存储c这个字符.

* 也就是说

* SymbolFrequencies[0]==0x00

* SymbolFrequencies[1]==0x01

* ......

* SymbolFrequencies[65]==A ( A==65 :) )

* SymbolFrequencies[66]==B ( B==66 :) )

* ......

* 起初在typedf SymbolFrequencies的时候, 我着实没看懂, 实在太巧妙了.

*/

(*pSF)[uc] = new_leaf_node( uc );

}

/*

* 这个自加也非常巧妙, 遇到uc字符则将第uchuffman_nodecount自加

* 实际就是一个字符一个字符读下去,,

* 读到了A ++(pSF['A'].count) == ++(pSF[65].count)

* 读到了B ++(pSF['B'].count) == ++(pSF[66].count)

* TNND的牛X!

*/

++( (*pSF)[uc]->count );

++total_count;

}

return total_count;

}

/* 计算buffer内各个字符的频率,get_symbol_frequencies函数同理 */

static unsigned int

get_symbol_frequencies_from_memory(SymbolFrequencies *pSF,

const unsigned char *bufin,

unsigned int bufinlen)

{

unsigned int i;

unsigned int total_count = 0;

/* Set all frequencies to 0. */

init_frequencies(pSF);

/* Count the frequency of each symbol in the input file. */

for(i = 0; i < bufinlen; ++i)

{

unsigned char uc = bufin[i];

if( !(*pSF)[uc] )

{

(*pSF)[uc] = new_leaf_node(uc);

}

++(*pSF)[uc]->count;

++total_count;

}

return total_count;

}

/*

* When used by qsort, SFComp sorts the array so that

* the symbol with the lowest frequency is first. Any

* NULL entries will be sorted to the end of the list.

*

* 两个huffman_node进行对比, count作为比较依据.

* 即对比两个不同 symbol 出现的频率

* 非叶子节点通通排在后面

*/

static int

SFComp(const void *p1, const void *p2)

{

const huffman_node *hn1 = *(const huffman_node**)p1;

const huffman_node *hn2 = *(const huffman_node**)p2;

/* Sort all NULLs to the end. */

if(hn1 == NULL && hn2 == NULL) return 0;

if(hn1 == NULL) return 1;

if(hn2 == NULL) return -1;

if(hn1->count > hn2->count) return 1;

else if(hn1->count < hn2->count) return -1;

return 0;

}

/*

* build_symbol_encoder builds a SymbolEncoder by walking

* down to the leaves of the Huffman tree and then,

* for each leaf, determines its code.

*

* 递归方式为每个叶子节点编码

*/

static void

build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSE)

{

if(subtree == NULL) return;

if( subtree->isLeaf )

{

(*pSE)[subtree->symbol] = new_code( subtree );

}

else

{

build_symbol_encoder(subtree->zero, pSE);

build_symbol_encoder(subtree->one, pSE);

}

}

/*

* calculate_huffman_codes turns pSF into an array

* with a single entry that is the root of the

* huffman tree. The return value is a SymbolEncoder,

* which is an array of huffman codes index by symbol value.

*

* 为每个node编码. 这个函数比较重要, 精华就是在这个函数里头的for循环. 哈哈

* 整个tree的建立全都依赖这个函数

*/

static SymbolEncoder*

calculate_huffman_codes(SymbolFrequencies * pSF)

{

unsigned int i = 0;

unsigned int n = 0;

huffman_node *m1 = NULL, *m2 = NULL;

SymbolEncoder *pSE = NULL;

/*

* Sort the symbol frequency array by ascending frequency.

* 快速排序例程进行排序

* symbol频率为关键字做升序排列

* symbol的节点都会按升序排列, 没有symbol的节点会统一排在后面,

* 通过一个for就能计算出symbol的个数了.

*/

qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

/*

* Get the number of symbols.

* 计算huffman树中的字符数, 这个实现可读性不够好

*/

for(n = 0; (n<MAX_SYMBOLS) && (*pSF)[n]; ++n)

;

/*

* Construct a Huffman tree. This code is based

* on the algorithm given in Managing Gigabytes

* by Ian Witten et al, 2nd edition, page 34.

* Note that this implementation uses a simple

* count instead of probability.

*/

for(i = 0; i < (n-1); ++i)

{

/* Set m1 and m2 to the two subsets of least probability. */

m1 = (*pSF)[0];

m2 = (*pSF)[1];

/* Replace m1 and m2 with a set {m1, m2} whose probability

* is the sum of that of m1 and m2.

* 这个算法有优化的余地的, 因为n在一直减小.

* 将最小的两个元素合并后得到一个一个节点为m12, 此时m1,m2已经建立起来了关系.

* 这个m12的地址又被pSF[0]存储, 循环直至整个Tree建立成功.

* 指针在这里运用的实在太巧妙了.

* 这一行代码就是建树, 靠,NBA!

*/

(*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count+m2->count, m1, m2);

(*pSF)[1] = NULL;

/*

* Put newSet into the correct count position in pSF.

* 这里应该可以再进行优化, 是否有必要再进行排序, 或者被排序的数组过长了.

* 实际上每循环一次n都减少了一次

*/

qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);

}/* for完毕的时候就求出了root, pSF[0]就是root, 后面的元素都是NULL

* 而树通过for循环里头的

* (*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count+m2->count, m1, m2);

* 已经建立完成了*/

/* Build the SymbolEncoder array from the tree. */

pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));

memset(pSE, 0, sizeof(SymbolEncoder));

build_symbol_encoder((*pSF)[0], pSE);

return pSE;

}

/*

* Write the huffman code table. The format is:

* 4 byte code count in network byte order.

* 4 byte number of bytes encoded

* (if you decode the data, you should get this number of bytes)

* code1

* ...

* codeN, where N is the count read at the begginning of the file.

* Each codeI has the following format:

* 1 byte symbol, 1 byte code bit length, code bytes.

* Each entry has numbytes_from_numbits code bytes.

* The last byte of each code may have extra bits, if the number of

* bits in the code is not a multiple of 8.

*

* 编码后的格式 :

* 0-3byteFILE内出现的不同字符个数(几不同的字符个数)

* 4-7byteFILE内出现的全部字符个数(所有的字符)

* 8-X是真正的编码后值

*

*/

static int

write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)

{

unsigned long i, count = 0;

/*

* Determine the number of entries in se

* 计算 SymbolEncoder 内具有编码值的元素个数.

* 即有几种字符

*/

for(i = 0; i < MAX_SYMBOLS; ++i)

if( (*se)[i] )

++count;

/*

* Write the number of entries in network byte order.

* 将字符种数写入到文件头部, [0, 3]一共4个字节

*/

//i = htonl( count );

i = count;

if(fwrite(&i, sizeof(i), 1, out) != 1) return 1;

/*

* Write the number of bytes that will be encoded.

* 将字符个数追加到[4,7]一共4个字节

*/

//symbol_count = htonl(symbol_count);

symbol_count = symbol_count;

if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1) return 1;

/*

* Write the entries.

*/

for(i = 0; i < MAX_SYMBOLS; ++i)

{

huffman_code *p = (*se)[i];

if( p != NULL )

{ /*

* 每个单元分为三个部分 :

* symbol -- 字符

* numbits -- 叶子走到root需要的步数

* bits -- 叶子走到root的方式(即最终的编码, 比如说0101)

*/

unsigned int numbytes;

/* Write the 1 byte symbol. */

fputc((unsigned char)i, out);

/* Write the 1 byte code bit length. */

fputc(p->numbits, out);

/* Write the code bytes. 她这个注释就没有说是几byte, 值得思考一下 */

numbytes = numbytes_from_numbits( p->numbits );

/* 将叶子走到root的方式写进去, 这个方式会被整理为byte格式, 不够就补0 */

if(fwrite(p->bits, 1, numbytes, out) != numbytes) return 1;

}

}

return 0;

}

/*

* Allocates memory and sets *pbufout to point to it. The memory

* contains the code table.

*

* 以指定的格式将编码后的数据写入到cache中去, 实际是写到pbufout中去了.

*

*/

static int

write_code_table_to_memory(buf_cache *pc,

SymbolEncoder *se,

unsigned int symbol_count)

{

unsigned long i, count = 0;

/* Determine the number of entries in se. */

for(i = 0; i < MAX_SYMBOLS; ++i)

{

if((*se)[i])

{

++count; // 计算不同字符的个数

}

}

/* Write the number of entries in network byte order. */

//i = htonl(count);

i = count;

if( write_cache(pc, &i, sizeof(i)) ) // 前四个字节是memory内所有字符数

return 1;

/* Write the number of bytes that will be encoded. */

//symbol_count = htonl(symbol_count);

symbol_count = symbol_count;

if( write_cache(pc, &symbol_count, sizeof(symbol_count)) ) // 4-8字节是不同字符个数

return 1;

/* Write the entries. */

for(i = 0; i < MAX_SYMBOLS; ++i)

{

huffman_code *p = (*se)[i];

if( p )

{

/*

* 对于每次循环来说, 如果p不为NULL, 则将该字符对应的编码写入到cache.

* 存储格式为三个字节作为一个单位.

* byte0 --- 字符本身

* byte1 --- 该字符编码后的码值长度(2进制的位数)

* byte2 --- 该字符对应的码值

*/

unsigned int numbytes;

/*

* The value of i is < MAX_SYMBOLS (256), so it can

* be stored in an unsigned char.

* i转换为char, 可以对应到字符集

*/

unsigned char uc = (unsigned char)i;

/*

* Write the 1 byte symbol.

* 将字符写到cache

*/

if(write_cache(pc, &uc, sizeof(uc))) return 1;

/*

* Write the 1 byte code bit length.

* 将叶子节点到root所需要经过的步数写到cache, 也就是编码的长度

* 这个数据是为了解码使用的.

*/

uc = (unsigned char)p->numbits;

if(write_cache(pc, &uc, sizeof(uc))) return 1;

/*

* Write the code bytes.

* 将编码值对齐并写如到cache

* 事先必须知道编码由几位组成, 如果编码为9, 那么就需要2byte来存储这个码值

* 如果编码为4, 那么就需要1byte来存储了,

*/

numbytes = numbytes_from_numbits(p->numbits);

if(write_cache(pc, p->bits, numbytes)) return 1;

}

}

return 0;

}

/*

* read_code_table builds a Huffman tree from the code

* in the in file. This function returns NULL on error.

* The returned value should be freed with free_huffman_tree.

*

*

*/

static huffman_node*

read_code_table(FILE* in, unsigned int *pDataBytes)

{

huffman_node *root = new_nonleaf_node(0, NULL, NULL);

unsigned int count;

/*

* Read the number of entries.

* (it is stored in network byte order).

* 获得字符种数, 2byte就是出现的字符种数

*/

if( fread(&count, sizeof(count), 1, in) != 1 )

{

free_huffman_tree( root );

return NULL;

}

//count = ntohl(count);

count = count;

/*

* Read the number of data bytes this encoding represents.

* 一个有多少个字符

*/

if( fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1 )

{

free_huffman_tree(root);

return NULL;

}

//*pDataBytes = ntohl(*pDataBytes);

*pDataBytes = *pDataBytes;

/* Read the entries. */

while(count-- > 0)

{

int c;

unsigned int curbit;

unsigned char symbol;

unsigned char numbits;

unsigned char numbytes;

unsigned char *bytes;

huffman_node *p = root;

if( (c=fgetc(in)) == EOF )

{

free_huffman_tree( root );

return NULL;

}

symbol = (unsigned char)c;

if( (c=fgetc(in)) == EOF )

{

free_huffman_tree( root );

return NULL;

}

numbits = (unsigned char)c;

numbytes = (unsigned char)numbytes_from_numbits( numbits );

bytes = (unsigned char*)malloc( numbytes );

if( fread(bytes, 1, numbytes, in) != numbytes )

{

free(bytes);

free_huffman_tree(root);

return NULL;

}

/*

* Add the entry to the Huffman tree. The value

* of the current bit is used switch between

* zero and one child nodes in the tree. New nodes

* are added as needed in the tree.

*/

for(curbit = 0; curbit < numbits; ++curbit)

{

if(get_bit(bytes, curbit))

{

if(p->one == NULL)

{

p->one =

curbit == (unsigned char)(numbits-1) ?

new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);

p->one->parent = p;

}

p = p->one;

}

else

{

if(p->zero == NULL)

{

p->zero =

curbit == (unsigned char)(numbits - 1) ?

new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);

p->zero->parent = p;

}

p = p->zero;

}

}

free(bytes);

}

return root;

}

/*

* 将数据从buf读到bufout, 成功则返回0, 其他则返回1.

* pindex -- 拷贝的起点

*/

static int

memread(const unsigned char* buf,

unsigned int buflen,

unsigned int *pindex,

void* bufout,

unsigned int readlen)

{

assert(buf && pindex && bufout);

assert(buflen >= *pindex);

// 错误

if(buflen < *pindex) return 1;

if(readlen + *pindex >= buflen) return 1;

memcpy(bufout, buf + *pindex, readlen);

*pindex += readlen;

return 0;

}

/*

* 从编码后的buf内读数据.

*/

static huffman_node*

read_code_table_from_memory(const unsigned char* bufin,

unsigned int bufinlen,

unsigned int *pindex,

unsigned int *pDataBytes)

{

huffman_node *root = new_nonleaf_node(0, NULL, NULL);

unsigned int count;

/*

* Read the number of entries.

* (it is stored in network byte order).

* 读取

*/

if( memread(bufin, bufinlen, pindex, &count, sizeof(count)) )

{

free_huffman_tree(root);

return NULL;

}

//count = ntohl(count);

count = count;

/* Read the number of data bytes this encoding represents. */

if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes)))

{

free_huffman_tree(root);

return NULL;

}

//*pDataBytes = ntohl(*pDataBytes);

*pDataBytes = *pDataBytes;

/* Read the entries. */

while( (count--) > 0 )

{

unsigned int curbit;

unsigned char symbol;

unsigned char numbits;

unsigned char numbytes;

unsigned char *bytes;

huffman_node *p = root;

if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol)))

{

free_huffman_tree(root);

return NULL;

}

if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits)))

{

free_huffman_tree(root);

return NULL;

}

numbytes = (unsigned char)numbytes_from_numbits(numbits);

bytes = (unsigned char*)malloc(numbytes);

if(memread(bufin, bufinlen, pindex, bytes, numbytes))

{

free(bytes);

free_huffman_tree(root);

return NULL;

}

/*

* Add the entry to the Huffman tree. The value

* of the current bit is used switch between

* zero and one child nodes in the tree. New nodes

* are added as needed in the tree.

*/

for(curbit = 0; curbit < numbits; ++curbit)

{

if(get_bit(bytes, curbit))

{

if(p->one == NULL)

{

p->one = ( curbit==(unsigned char)(numbits - 1) ) ?

new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);

p->one->parent = p;

}

p = p->one;

}

else

{

if(p->zero == NULL)

{

p->zero = curbit == (unsigned char)(numbits - 1)

new_leaf_node(symbol)

: new_nonleaf_node(0, NULL, NULL);

p->zero->parent = p;

}

p = p->zero;

}

}

free(bytes);

}

return root;

}

/*

* 依次将各个字符的编码写入到out, 这次是直接写, 不对编码进行整齐工作

* 也就是不将编码强制为byte类型了, 而是直接写入到out.

*/

static int

do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)

{

unsigned char curbyte = 0;

unsigned char curbit = 0;

int c;

while( (c = fgetc(in)) != EOF)

{

//printf("c=%c /n", c);

unsigned char uc = (unsigned char)c;

huffman_code *code = (*se)[uc];

unsigned long i;

for(i = 0; i < code->numbits; ++i)

{

//printf("i=%d /n", i);

/* Add the current bit to curbyte. */

curbyte |= get_bit(code->bits, i) << curbit;

/* If this byte is filled up then write it

* out and reset the curbit and curbyte. */

if(++curbit == 8)

{

/*

* 依次将各个字符的编码写入到out, 这次是直接写, 不对编码进行整齐工作

* 也就是不将编码强制为byte类型了, 而是直接写入到out.

*/

fputc(curbyte, out);

printf("curbyte=%d/n", curbyte);

// printf("curbyte=%d /n", curbyte);

curbyte = 0;

curbit = 0;

}

}

printf("/n");

}

/*

* If there is data in curbyte that has not been

* output yet, which means that the last encoded

* character did not fall on a byte boundary,

* then output it.

*/

if(curbit > 0)

fputc(curbyte, out);

return 0;

}

/*

* memory进行编码.

*

* pc -- cache

* bufin -- 待编码的buffer

* bufinlen -- buffer, 即字符个数

* se --

*/

static int

do_memory_encode(buf_cache *pc,

const unsigned char *bufin,

unsigned int bufinlen,

SymbolEncoder *se)

{

unsigned char curbyte = 0; //

unsigned char curbit = 0;

unsigned int i;

/* bufin 内的字符依次循环 */

for(i = 0; i < bufinlen; ++i)

{

unsigned char uc = bufin[i];

huffman_code *code = (*se)[uc]; // 取出第i个字符的编码

unsigned long i;

/* 对第i个字符编码长度进行循环 */

for(i = 0; i < code->numbits; ++i)

{

/*

* Add the current bit to curbyte.

* 依次取出

*/

curbyte |= ( get_bit(code->bits, i) << curbit );

/*

* If this byte is filled up then write it

* out and reset the curbit and curbyte

*/

if(++curbit == 8)

{

/*

* 满了一个字节则写cache

*

*/

if(write_cache(pc, &curbyte, sizeof(curbyte))) return 1;

curbyte = 0;

curbit = 0;

}

}

}

/*

* If there is data in curbyte that has not been

* output yet, which means that the last encoded

* character did not fall on a byte boundary,

* then output it.

*/

return curbit > 0 ? write_cache(pc, &curbyte, sizeof(curbyte)) : 0;

}

/*

* huffman_encode_file huffman encodes in to out.

* FILE对象进行编码, *in编码后写入*out.

*/

int

huffman_encode_file(FILE *in, FILE *out)

{

SymbolFrequencies sf;

SymbolEncoder *se;

huffman_node *root = NULL;

int rc;

unsigned int symbol_count;

/*

* Get the frequency of each symbol in the input file.

* FILE中的数据写入到SymbolFrequencies, 计算出了各个字符出现频率,

* 也计算出了FILE对象内的总字符数.

*/

symbol_count = get_symbol_frequencies(&sf, in);

/*

* Build an optimal table from the symbolCount.

* 建树完毕, root就是sf[0], sf[1]=sf[2]=...sf[MAX_SYMBOLS]=NULL

*/

se = calculate_huffman_codes( &sf );

root = sf[0];

/*

* Scan the file again and, using the table

* previously built, encode it into the output file.

*/

rewind( in ); // 将文件指针重新指向一个流的开头

/*

* 将编码写如文件流

*/

rc = write_code_table(out, se, symbol_count);

if(rc == 0)

rc = do_file_encode(in, out, se);

/* Free the Huffman tree. */

free_huffman_tree( root );

free_encoder(se);

return rc;

}

/**

* FILE进行解码

*/

int

huffman_decode_file(FILE *in, FILE *out)

{

huffman_node *root;

huffman_node *p;

int c;

unsigned int data_count;

/* Read the Huffman code table. */

root = read_code_table(in, &data_count);

if( !root ) return 1;

/* Decode the file. */

p = root;

while(data_count>0 && (c=fgetc(in))!=EOF)

{

unsigned char byte = (unsigned char)c;

unsigned char mask = 1;

while(data_count > 0 && mask)

{

p = ( (byte&mask)? p->one : p->zero );

mask <<= 1;

if( p->isLeaf )

{

fputc(p->symbol, out);

p = root;

--data_count;

}

}

}

free_huffman_tree( root );

return 0;

}

// --------------------------------------------------------------------------------------

#define CACHE_SIZE 1024 /*

* memory编码需要考虑到数据源的速度和bufout的接收速度不匹配, cache

* cache是为了连接不同速度的设备而设计的

*/

/**

* buffer进行编码

*

*/

int huffman_encode_memory(const unsigned char *bufin,

unsigned int bufinlen,

unsigned char **pbufout,

unsigned int *pbufoutlen)

{

SymbolFrequencies sf;

SymbolEncoder *se;

huffman_node *root = NULL;

int rc;

unsigned int symbol_count; // memory中的字符个数

buf_cache cache; //

/* Ensure the arguments are valid. 检测参数合法性 */

if(!pbufout || !pbufoutlen) return 1;

if( init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen) ) return 1;

/*

* Get the frequency of each symbol in the input memory

* 计算bufin内各个字符出现的频率, 并求得bufin内存储的字符个数.

*/

symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);

/*

* Build an optimal table from the symbolCount.

* 为每个Node编码, 如果这个NodesymbolNULL, 则不编码了.

*/

se = calculate_huffman_codes( &sf );

root = sf[0]; // root来拉, 哈哈, 逻辑树出来了.

/*

* Scan the memory again and, using the table

* previously built, encode it into the output memory.

* se内的数据统统的写入到cache中克.

*/

rc = write_code_table_to_memory(&cache, se, symbol_count);

if(rc == 0)

{

/*

* 为什么write_code_table_to_memory成功之后还要要执行一次do_memory_encode?

*

*/

rc = do_memory_encode(&cache, bufin, bufinlen, se);

}

/* Flush the cache. */

flush_cache( &cache );

/* Free the Huffman tree. */

free_huffman_tree( root );

free_encoder( se );

free_cache( &cache );

return rc;

}

/**

* bufin进行解码. 将解码后的数据写入到bufout.

*/

int huffman_decode_memory(const unsigned char *bufin,

unsigned int bufinlen,

unsigned char **pbufout,

unsigned int *pbufoutlen)

{

huffman_node *root, *p;

unsigned int data_count;

unsigned int i = 0;

unsigned char *buf;

unsigned int bufcur = 0;

/* Ensure the arguments are valid. */

if(!pbufout || !pbufoutlen) return 1;

/* Read the Huffman code table. */

root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);

if(!root) return 1;

buf = (unsigned char*)malloc(data_count);

/* Decode the memory. */

p = root;

for(; i < bufinlen && data_count > 0; ++i)

{

unsigned char byte = bufin[i];

unsigned char mask = 1;

while(data_count > 0 && mask)

{

p = byte & mask ? p->one : p->zero;

mask <<= 1;

if(p->isLeaf)

{

buf[bufcur++] = p->symbol;

p = root;

--data_count;

}

}

}

free_huffman_tree(root);

*pbufout = buf;

*pbufoutlen = bufcur;

return 0;

}

/*

--------------------------------------------------------------------------------

不妨假设待编码的buffer "ABCAADC"

手工分析可得该树的形状 :

当然也可以也可以将这个树沿y方向翻转180

root

//

/ /

A *

//

/ /

C *

//

/ /

B D

现在我们知道的两个事实是 :

这个buffer内的字符数为 : symbol_count = 7 ( "ABCAADC"一个有7个字符 )

这个buffer内出现的字符种数为 : count = 4 ( 只出现了ABCD四种字符 )

接下来人工分析各个字符 :

symbol | count | bits

--------|-----------|---------------------

A | 3 | 0000 0000

B | 1 | 0000 0110

C | 2 | 0000 0010

D | 1 | 0000 0111

我们设置左边为0, 右边为1. bits为从叶子节点走到root的路径.

分析完毕后, 需要实现整个编码过程. 编码过程暂时跳过.

假设成功编码完毕, 需要把编码后的数据写入到bufout.

bufout

0-3byte为字符种数 count

4-7byte为字符个数 symbol_count

然后是遍历SymbolEncoder, 依次对每种字符进行编码(我们这个例子只进行4次编码)

我们对每种字符都会进行编码, 每个字符编码后的输出不妨称为frame

那么这个frame是由三个部分组成的:

(这个我们可以肯定一个char肯定是1byte)

symbol (1byte) --

(这个我们可以肯定就算这个树根本没有分支, 永远只有左/右孩子, 那也了不起是是256的深度)

numbits (1byte) -- 叶子走到root需要的步

bits (1byte) -- 叶子走到root的方式(即最终的编码, 比如说011)

开始我对这个bites到底占了多少个byte很是怀疑, 因为我不知道从叶子走到root

到底耗费了几步. 这里需要好好研究一下, 最好和最差情况. 暂时假设是个变化的byte.

但是有一点bitesnumbits是有关系的, 所以只要知道numbits还是可以知道bites占据

多少byte, 也知道bits到底是有几位.

byte content

---------------------------------------------

0-3(4) count

4-7(4) symbol_count

Axabyte frame_struct

Bxbbyte frame_struct

Cxcbyte frame_struct

Dxdbyte frame_struct

X X

这个Xdo_file_encode函数写到bufout中去的数据, 那么这个数据是什么呢

实际上它是循环的把出现的字符的bits写到bufout,

根据这个数据,解码的时候就可以依次的找到第0,1,2...个位置出现的是什么字符

--------------------------------------------------------------------------------

*/

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多