C#高手之路详解解析Hashtable、Dictionary、SortedDictiona...

kittywei 2011-05-03

展开全文

http://www./cshare/show-6804-6.aspx
本文和大家一起来学习下解析Hashtable、Dictionary、SortedDictionary、SortedList的比较应用。
下面深入地分析如题的4个字典的原理。
我们先看Hashtable。
MSDN的解释：表示键/值对的集合，这些键/值对根据键的哈希代码进行组织。
Hash算法是把任意长度的输入（又叫做预映射， pre-image），通过散列算法，变换成固定长度的输出，该输出就是散列值。这种转换是一种压缩映射，也就是，散列值的空间通常远小于输入的空间，不同的输入可能会散列成相同的输出，而不可能从散列值来唯一的确定输入值。
Hashtable 对象由包含集合元素的存储桶组成。存储桶是 Hashtable 中各元素的虚拟子组，与大多数集合中进行的搜索和检索相比，存储桶可令搜索和检索更为便捷。每一存储桶都与一个哈希代码关联，该哈希代码是使用哈希函数生成的并基于该元素的键。
Hashtable 类默认的装填因子是 1.0，但实际上它默认的装填因子是 0.72。所有从构造函数输入的装填因子，Hashtable 类内部都会将其乘以0.72。这是一个要求苛刻的数字, 某些时刻将装填因子增减 0.01, 可能你的 Hashtable 存取效率就提高或降低了 50%，其原因是装填因子决定散列表容量，而散列表容量又影响 Key 的冲突几率，进而影响性能。0.72 是 Microsoft经过大量实验得出的一个比较平衡的值。
我们看Hashtable的一些源码：
Hashtable .ctor [http://www.]public Hashtable() : this(0, (float) 1f) { } public Hashtable(int capacity, float loadFactor) { if (capacity < 0) { throw new ArgumentOutOfRangeException("capacity", Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum")); } if ((loadFactor < 0.1f) || (loadFactor > 1f)) { throw new ArgumentOutOfRangeException("loadFactor", Environment.GetResourceString("ArgumentOutOfRange_HashtableLoadFactor", new object[] { 0.1, 1.0 })); } this.loadFactor = 0.72f * loadFactor; double num = ((float) capacity) / this.loadFactor; if (num > 2147483647.0) { throw new ArgumentException(Environment.GetResourceString("Arg_HTCapacityOverflow")); } int num2 = (num > 11.0) ? HashHelpers.GetPrime((int) num) : 11; this.buckets = new bucket[num2]; this.loadsize = (int) (this.loadFactor * num2); this.isWriterInProgress = false; }Hashtable 扩容是个耗时非常惊人的内部操作，它之所以写入效率仅为读取效率的 1/10 数量级，频繁的扩容是一个因素。当进行扩容时，散列表内部要重新 new 一个更大的数组，然后把原来数组的内容拷贝到新数组，并进行重新散列。如何 new这个更大的数组也有讲究。散列表的初始容量一般来讲是个素数。当扩容时，新数组的大小会设置成原数组双倍大小的相近的一个素数。

文章来自学IT网：http://www./html/2010-05/21-9450840172010523153218218.html

Hashtable expand [http://www.]private void expand() { int prime = HashHelpers.GetPrime(this.buckets.Length * 2); this.rehash(prime); } private void rehash(int newsize) { this.occupancy = 0; Hashtable.bucket[] newBuckets = new Hashtable.bucket[newsize]; for (int i = 0; i < this.buckets.Length; i ) { Hashtable.bucket bucket = this.buckets[i]; if ((bucket.key != null) && (bucket.key != this.buckets)) { this.putEntry(newBuckets, bucket.key, bucket.val, bucket.hash_coll & 0x7fffffff); } } Thread.BeginCriticalRegion(); this.isWriterInProgress = true; this.buckets = newBuckets; this.loadsize = (int) (this.loadFactor * newsize); this.UpdateVersion(); this.isWriterInProgress = false; Thread.EndCriticalRegion(); }HashTable数据结构存在问题：空间利用率偏低、受填充因子影响大、扩容时所有的数据需要重新进行散列计算。虽然Hash具有O(1)的数据检索效率，但它空间开销却通常很大，是以空间换取时间。所以Hashtable适用于读取操作频繁，写入操作很少的操作类型。

文章来自学IT网：http://www./cshare/show-6804-2.aspx

而Dictionary<K, V> 也是用的Hash算法，通过数组实现多条链式结构。不过它是采用分离链接散列法。采用分离链接散列法不受到装填因子的影响，扩容时原有数据不需要重新进行散列计算。
采用分离链接法的 Dictionary<TKey, TValue> 会在内部维护一个链表数组。对于这个链表数组 L0,L1,...，LM-1, 散列函数将告诉我们应当把元素 X 插入到链表的什么位置。然后在 find 操作时告诉我们哪一个表中包含了 X。这种方法的思想在于：尽管搜索一个链表是线性操作，但如果表足够小，搜索非常快(事实也的确如此，同时这也是查找，插入，删除等操作并非总是 O(1) 的原因)。特别是，它不受装填因子的限制。
这种情况下，常见的装填因子是 1.0。更低的装填因子并不能明显的提高性能，但却需要更多的额外空间。
Dictionary .ctor [http://www.]public Dictionary() : this(0, null) { } public Dictionary(int capacity, IEqualityComparer<TKey> comparer) { if (capacity < 0) { ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.capacity); } if (capacity > 0) { this.Initialize(capacity); } if (comparer == null) { comparer = EqualityComparer<TKey>.Default; } this.comparer = comparer; } private void Resize() { int prime = HashHelpers.GetPrime(this.count * 2); int[] numArray = new int[prime]; for (int i = 0; i < numArray.Length; i ) { numArray[i] = -1; } Entry<TKey, TValue>[] destinationArray = new Entry<TKey, TValue>[prime]; Array.Copy(this.entries, 0, destinationArray, 0, this.count); for (int j = 0; j < this.count; j ) { int index = destinationArray[j].hashCode % prime; destinationArray[j].next = numArray[index]; numArray[index] = j; } this.buckets = numArray; this.entries = destinationArray; }Dictionary的插入算法：1、计算key的hash值，并且找到buckets中目标桶的链首索引，2、从链上依次查找是否key已经保存，3、如果没有的话，判断是否存在freeList，4、如果存在freeList，从freeList上摘下结点保存数据，否则追加在count位置上。

文章来自学IT网：http://www./cshare/show-6804-3.aspx

Dictionary Add [http://www.]private void Insert(TKey key, TValue value, bool add) { int freeList; if (key == null) { ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key); } if (this.buckets == null) { this.Initialize(0); } int num = this.comparer.GetHashCode(key) & 0x7fffffff; int index = num % this.buckets.Length; for (int i = this.buckets[index]; i >= 0; i = this.entries[i].next) { if ((this.entries[i].hashCode == num) && this.comparer.Equals(this.entries[i].key, key)) { if (add) { ThrowHelper.ThrowArgumentException(ExceptionResource.Argument_AddingDuplicate); } this.entries[i].value = value; this.version ; return; } } if (this.freeCount > 0) { freeList = this.freeList; this.freeList = this.entries[freeList].next; this.freeCount--; } else { if (this.count == this.entries.Length) { this.Resize(); index = num % this.buckets.Length; } freeList = this.count; this.count ; } this.entries[freeList].hashCode = num; this.entries[freeList].next = this.buckets[index]; this.entries[freeList].key = key; this.entries[freeList].value = value; this.buckets[index] = freeList; this.version ; }buckets数组保存所有数据链的链首，Buckets[i]表示在桶i中数据链的链首元素。entries结构体数组用于保存实际的数据，通过next值作为链式结构的向后索引。删除的数据空间会被串入到freeList链表的首部，当再次插入数据时，会首先查找freeList链表，以提高查找entries中空闲数据项位置的效率。在枚举器中，枚举顺序为entries数组的下标递增顺序。
Dictionary Remove [http://www.]public bool Remove(TKey key) { if (key == null) { ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key); } if (this.buckets != null) { int num = this.comparer.GetHashCode(key) & 0x7fffffff; int index = num % this.buckets.Length; int num3 = -1; for (int i = this.buckets[index]; i >= 0; i = this.entries[i].next) { if ((this.entries[i].hashCode == num) && this.comparer.Equals(this.entries[i].key, key)) { if (num3 < 0) { this.buckets[index] = this.entries[i].next; } else { this.entries[num3].next = this.entries[i].next; } this.entries[i].hashCode = -1; this.entries[i].next = this.freeList; this.entries[i].key = default(TKey); this.entries[i].value = default(TValue); this.freeList = i; this.freeCount ; this.version ; return true; } num3 = i; } } return false; } 而SortedDictionary，MSDN是这样描述的：
SortedDictionary<(Of <(TKey, TValue>)>) 泛型类是检索运算复杂度为 O(log n) 的二叉搜索树，其中 n 是字典中的元素数。就这一点而言，它与 SortedList<(Of <(TKey, TValue>)>) 泛型类相似。这两个类具有相似的对象模型，并且都具有 O(log n) 的检索运算复杂度。这两个类的区别在于内存的使用以及插入和移除元素的速度：

文章来自学IT网：http://www./cshare/show-6804-4.aspx

SortedList<(Of <(TKey, TValue>)>) 使用的内存比 SortedDictionary<(Of <(TKey, TValue>)>) 少。
SortedDictionary<(Of <(TKey, TValue>)>) 可对未排序的数据执行更快的插入和移除操作：它的时间复杂度为 O(log n)，而 SortedList<(Of <(TKey, TValue>)>) 为 O(n)。
如果使用排序数据一次性填充列表，则 SortedList<(Of <(TKey, TValue>)>) 比 SortedDictionary<(Of <(TKey, TValue>)>) 快。
SortedDictionary<K, V>是按照K有序排列的(K, V)数据结构，以红黑树作为内部数据结构对K进行排列保存– TreeSet<T>，红黑树是一棵二叉搜索树，每个结点具有黑色或者红色的属性。它比普通的二叉搜索树拥有更好的平衡性。2-3-4树是红黑树在“理论”上的数据结构。
2-3-4树插入算法：类似于二叉搜索树的插入（插入数据插入到树的叶子结点），如果插入位置是2-结点或者3-结点，那么直接插入到当前结点，如果插入位置是4-结点，需要将当前的4-结点进行拆分，然后再执行后继的插入操作。
SortedDictionary Add [http://www.]public void Add(T item) { if (this.root == null) { this.root = new Node<T>(item, false); this.count = 1; } else { Node<T> root = this.root; Node<T> node = null; Node<T> grandParent = null; Node<T> greatGrandParent = null; int num = 0; while (root != null) { num = this.comparer.Compare(item, root.Item); if (num == 0) { this.root.IsRed = false; ThrowHelper.ThrowArgumentException(ExceptionResource.Argument_AddingDuplicate); } if (TreeSet<T>.Is4Node(root)) { TreeSet<T>.Split4Node(root); if (TreeSet<T>.IsRed(node)) { this.InsertionBalance(root, ref node, grandParent, greatGrandParent); } } greatGrandParent = grandParent; grandParent = node; node = root; root = (num < 0) ? root.Left : root.Right; } Node<T> current = new Node<T>(item); if (num > 0) { node.Right = current; } else { node.Left = current; } if (node.IsRed) { this.InsertionBalance(current, ref node, grandParent, greatGrandParent); } this.root.IsRed = false; this.count ; this.version ; } }我们来测试一下Hashtable、Dictionary和SortedDictionary的插入和查找性能。

文章来自学IT网：http://www./cshare/show-6804-5.aspx

性能测试代码 [http://www.]using System; using System.Collections; using System.Collections.Generic; using System.Diagnostics; using System.Linq; namespace DictionaryTest { class Program { private static int totalCount = 10000; static void Main(string[] args) { HashtableTest(); DictionaryTest(); SortedDictionaryTest(); Console.ReadKey(); } private static void HashtableTest() { Hashtable hastable = new Hashtable(); Stopwatch watch = new Stopwatch(); watch.Start(); for (int i = 1; i < totalCount; i ) { hastable.Add(i, 0); } watch.Stop(); Console.WriteLine(string.Format("Hashtable添加{0}个元素耗时：{1}ms",totalCount, watch.ElapsedMilliseconds)); Console.WriteLine("Hashtable不做查找测试"); hastable.Clear(); } private static void DictionaryTest() { Dictionary<int, int> dictionary = new Dictionary<int, int>(); Stopwatch watch = new Stopwatch(); watch.Start(); for (int i = 1; i < totalCount; i ) { dictionary.Add(i, 0); } watch.Stop(); Console.WriteLine(string.Format("Dictionary添加{0}个元素耗时：{1}ms",totalCount, watch.ElapsedMilliseconds)); watch.Reset(); watch.Start(); dictionary.Select(o => o.Key % 1000 == 0).ToList().ForEach(o => { }); watch.Stop(); Console.WriteLine(string.Format("Dictionary查找能被1000整除的元素耗时：{0}ms", watch.ElapsedMilliseconds)); dictionary.Clear(); } private static void SortedDictionaryTest() { SortedDictionary<int, int> dictionary = new SortedDictionary<int, int>(); Stopwatch watch = new Stopwatch(); watch.Start(); for (int i = 1; i < totalCount; i ) { dictionary.Add(i, 0); } watch.Stop(); Console.WriteLine(string.Format("SortedDictionary添加{0}个元素耗时：{1}ms",totalCount, watch.ElapsedMilliseconds)); watch.Reset(); watch.Start(); dictionary.Select(o => o.Key % 1000 == 0).ToList().ForEach(o => { }); watch.Stop(); Console.WriteLine(string.Format("SortedDictionary查找能被1000整除的元素耗时：{0}ms", watch.ElapsedMilliseconds)); dictionary.Clear(); } } }最终结果如图：

文章来自学IT网：http://www./cshare/show-6804-6.aspx
文章来自学IT网：http://www./cshare/show-6804-6.aspx