因为需要将统计表保存到文件中,以便解码时重建哈夫曼树,所以当数据量小时,可能压缩后的文件比原文件还大。哈夫曼编码适合用来压缩数据量大且字符出现频率高的文件。算法原理参见 哈夫曼树(Huffman Tree)
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace HuffmanTest
{
internal class Program
{
static void Main(string[] args)
{
//读取测试数据
string s = File.ReadAllText("test.txt");
//压缩
string base64 = HuffmanCoding.Compress(s);
File.WriteAllText("encode.txt", base64);
//解压
s = HuffmanCoding.Decompress(base64);
File.WriteAllText("decode.txt", s);
Console.ReadKey();
}
}
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.IO;
/// <summary>
/// 哈夫曼编码压缩算法
/// </summary>
public sealed class HuffmanCoding
{
//哈夫曼树节点
public sealed class Node : IComparable<Node>
{
//值
public byte value;
//出现频率
public int frequency;
public Node left;
public Node right;
public Node(byte value, int frequency)
{
this.value = value;
this.frequency = frequency;
}
public int CompareTo(Node other)
{
return frequency.CompareTo(other.frequency);
}
}
// 生成统计表
private static Dictionary<byte, Node> GenerateStatistics(byte[] data)
{
//统计不同字节出现的频率
var statistics = new Dictionary<byte, Node>();
foreach (byte b in data)
{
if (statistics.ContainsKey(b))
statistics[b].frequency++;
else
statistics.Add(b, new Node(b, 1));
}
return statistics;
}
// 生成哈夫曼编码表
private static Dictionary<byte, string> GenerateCodes(Node node, string code = "", Dictionary<byte, string> codeTable = null)
{
if (codeTable == null)
codeTable = new Dictionary<byte, string>();
//叶子节点
if (node.left == null && node.right == null)
{
codeTable[node.value] = code;
return codeTable;
}
//树的广度遍历
GenerateCodes(node.left, code + "0", codeTable);
GenerateCodes(node.right, code + "1", codeTable);
return codeTable;
}
// 生成啥夫曼树
private static Node GenerateTree(Dictionary<byte, Node> statistics)
{
List<Node> nodes = new List<Node>();
nodes.AddRange(statistics.Values);
nodes.Sort();
//构造哈夫曼树
while (nodes.Count > 1)
{
//取出频率最小的两个节点
var leftNode = nodes[0];
var rightNode = nodes[1];
nodes.RemoveRange(0, 2);
//构造一棵子树
var node = new Node(0, leftNode.frequency + rightNode.frequency);
node.left = leftNode;
node.right = rightNode;
//将子树加入列表并重新排序
nodes.Add(node);
nodes.Sort();
}
//哈夫曼树的根节点
var rootNode = nodes[0];
return rootNode;
}
// 压缩数据, 返回 base64
public static string Compress(string s)
{
byte[] bytes = Encoding.UTF8.GetBytes(s);
byte[] encodeData = HuffmanCoding.Compress(bytes);
string base64 = Convert.ToBase64String(encodeData);
return base64;
}
// 压缩数据
public static byte[] Compress(byte[] data)
{
if (data == null)
return data;
//频率统计表
var statistics = GenerateStatistics(data);
//哈夫曼树根节点
var rootNode = GenerateTree(statistics);
//哈夫曼编码表
var codeTable = GenerateCodes(rootNode);
PrintCodeTable("code_table.txt", codeTable);
//生成压缩数据
byte[] encodeData = Encode(statistics, codeTable, data);
return encodeData;
}
// 解压数据
public static string Decompress(string base64)
{
byte[] bytes = Convert.FromBase64String(base64);
byte[] decodeData = HuffmanCoding.Decompress(bytes);
return Encoding.UTF8.GetString(decodeData);
}
// 解压数据
public static byte[] Decompress(byte[] data)
{
if (data == null)
return data;
MemoryStream ms = new MemoryStream(data);
BinaryReader br = new BinaryReader(ms);
//频率统计表
var statistics = ReadStatistics(br);
//哈夫曼树根节点
var rootNode = GenerateTree(statistics);
//解码数据
var decodeData = Decode(br, rootNode);
return decodeData;
}
// 从内存流读统计表
private static Dictionary<byte, Node> ReadStatistics(BinaryReader br)
{
var statistics = new Dictionary<byte, Node>();
//读统计表长度
int length = br.ReadInt32();
for (int i = 0; i < length; i++)
{
byte b = br.ReadByte();
int f = br.ReadInt32();
var node = new Node(b, f);
statistics.Add(b, node);
}
return statistics;
}
// 解码内容
private static byte[] Decode(BinaryReader br, Node rootNode)
{
//内容长度
int length = br.ReadInt32();
//压缩数据
var data = br.ReadBytes(length);
//数据解码
Node node = rootNode;
List<byte> result = new List<byte>();
BitArray bits = new BitArray(data);
for (int i = 0; i < bits.Length; i++)
{
bool b = bits[i];
node = b ? node.right : node.left;
//如果是叶子节点
if (node.left == null && node.right == null)
{
result.Add(node.value);
//继续从根节点查找叶子
node = rootNode;
//内容已全部解码
if (result.Count >= length)
break;
}
}
return result.ToArray();
}
// 写统计表到内存流
private static void WriteStatistics(Dictionary<byte, Node> statistics, BinaryWriter bw)
{
//写入统计表长度
bw.Write(statistics.Count);
foreach (byte k in statistics.Keys)
{
//字节
bw.Write(k);
//出现的频率
bw.Write(statistics[k].frequency);
}
}
// 写内容到内存流
private static void WriteData(Dictionary<byte, Node> statistics, Dictionary<byte, string> codeTable, byte[] data, BinaryWriter bw)
{
//写入内容长度
bw.Write(data.Length);
//计算需要用多少bit来存储经哈夫曼编码后的数据
int bitCount = 0;
foreach (var b in statistics.Keys)
{
int frequence = statistics[b].frequency;
string code = codeTable[b];
bitCount += code.Length * frequence;
}
//将数据的哈夫曼编码存储到BitArray中
BitArray bitArray = new BitArray(bitCount);
int i = 0;
foreach (var b in data)
{
string code = codeTable[b];
foreach (var c in code)
{
bitArray.Set(i++, c.Equals('1'));
}
}
//将BitArray中的数据复制到buffer中
int size = bitArray.Count / 8;
if (bitArray.Count % 8 > 0)
size++;
byte[] buffer = new byte[size];
bitArray.CopyTo(buffer, 0);
//将编码后的数据写入内存流
bw.Write(buffer);
}
// 生成编码后的数据
private static byte[] Encode(Dictionary<byte, Node> statistics, Dictionary<byte, string> codeTable, byte[] data)
{
MemoryStream ms = new MemoryStream();
BinaryWriter bw = new BinaryWriter(ms);
WriteStatistics(statistics, bw);
WriteData(statistics, codeTable, data, bw);
byte[] bytes = ms.ToArray();
return bytes;
}
// 输出编码表
private static void PrintCodeTable(string filePath, Dictionary<byte, string> codeTable)
{
StringBuilder sb = new StringBuilder();
foreach (var b in codeTable.Keys)
{
sb.AppendLine(string.Format("{0}: {1}", (char)b, codeTable[b]));
}
File.WriteAllText(filePath, sb.ToString());
}
}