Use Huffman compression with a fixed tree to lower data usage.

This commit is contained in:
2017-07-09 23:24:24 +02:00
parent b969da80f2
commit 044f026365
4 changed files with 339 additions and 4 deletions

View File

@@ -1,4 +1,6 @@
namespace VAR.UrlCompressor using System;
namespace VAR.UrlCompressor
{ {
static class ByteExtensions static class ByteExtensions
{ {
@@ -28,5 +30,17 @@
bytes[bytePosition] = (byte)(bytes[bytePosition] & (0xffffffff - (0x1 << (7 - bitPosition)))); bytes[bytePosition] = (byte)(bytes[bytePosition] & (0xffffffff - (0x1 << (7 - bitPosition))));
} }
} }
/// <summary>
/// Stores <paramref name="value"/> at <paramref name="position"/>, growing the
/// buffer when the position lies beyond its current end.
/// </summary>
/// <param name="bytes">Buffer to write into.</param>
/// <param name="position">Zero-based index of the byte to write.</param>
/// <param name="value">Byte to store.</param>
/// <returns>
/// The buffer that holds the written value: the same instance when it was large
/// enough, otherwise a larger copy. Callers must keep using the returned array.
/// </returns>
public static byte[] WriteByte(this byte[] bytes, int position, byte value)
{
    if (position >= bytes.Length)
    {
        // Start from at least 1: doubling a zero length never makes progress,
        // which previously made an empty buffer loop forever.
        int newLength = Math.Max(bytes.Length, 1);
        while (newLength <= position)
        {
            // checked: an overflowing capacity fails loudly instead of wrapping around.
            newLength = checked(newLength * 2);
        }

        // Grow once to the final size instead of reallocating on every doubling.
        byte[] newBytes = new byte[newLength];
        Array.Copy(bytes, newBytes, bytes.Length);
        bytes = newBytes;
    }

    bytes[position] = value;
    return bytes;
}
} }
} }

View File

@@ -0,0 +1,197 @@
using System;
using System.Collections.Generic;
using System.Linq;
namespace VAR.UrlCompressor
{
/// <summary>
/// Huffman coder built from a table of symbol frequencies. Every tree also
/// contains an end-of-data (EOD) symbol that is appended by <see cref="Encode"/>
/// and stops <see cref="Decode"/>.
/// </summary>
class HuffmanTree
{
    // Sentinel symbol marking the end of the encoded bit stream.
    private const char EOD = (char)0xFFFF;

    private List<HuffmanTreeNode> nodes = new List<HuffmanTreeNode>();

    /// <summary>Root node of the code tree.</summary>
    public HuffmanTreeNode Root { get; set; }

    private Dictionary<char, int> _frequencies = new Dictionary<char, int>();

    /// <summary>
    /// Builds the tree from a precomputed frequency table.
    /// </summary>
    /// <param name="frequencies">
    /// Symbol frequencies. The table is copied, so the caller's dictionary is not
    /// modified and may be reused. It must not contain the EOD symbol (0xFFFF).
    /// </param>
    public HuffmanTree(Dictionary<char, int> frequencies)
    {
        // Work on a private copy: adding EOD to the caller's dictionary made a
        // second tree built from the same table fail on the duplicate key.
        _frequencies = new Dictionary<char, int>(frequencies);
        _frequencies.Add(EOD, 1);
        BuildTree();
    }

    /// <summary>
    /// Builds the tree from the character frequencies found in a sample text.
    /// </summary>
    /// <param name="source">Text whose characters are counted.</param>
    public HuffmanTree(string source)
    {
        for (int i = 0; i < source.Length; i++)
        {
            int count;
            _frequencies.TryGetValue(source[i], out count);
            _frequencies[source[i]] = count + 1;
        }
        _frequencies.Add(EOD, 1);
        BuildTree();
    }

    /// <summary>
    /// Repeatedly merges the two least frequent nodes until one node remains,
    /// which becomes <see cref="Root"/>.
    /// </summary>
    private void BuildTree()
    {
        foreach (KeyValuePair<char, int> symbol in _frequencies)
        {
            nodes.Add(new HuffmanTreeNode() { Symbol = symbol.Key, Frequency = symbol.Value });
        }

        while (nodes.Count > 1)
        {
            // OrderBy is a stable sort, so ties keep their list order; the shape of
            // the tree (and therefore every code) depends on this, do not reorder.
            List<HuffmanTreeNode> orderedNodes = nodes.OrderBy(node => node.Frequency).ToList();
            HuffmanTreeNode lowest = orderedNodes[0];
            HuffmanTreeNode secondLowest = orderedNodes[1];

            nodes.Remove(lowest);
            nodes.Remove(secondLowest);
            nodes.Add(new HuffmanTreeNode()
            {
                Symbol = '*',
                Frequency = lowest.Frequency + secondLowest.Frequency,
                Left = lowest,
                Right = secondLowest
            });
        }

        // Assigned after the loop so that a tree holding a single symbol (only EOD)
        // still gets a root instead of leaving it null.
        Root = nodes.FirstOrDefault();
    }

    /// <summary>
    /// Compresses <paramref name="data"/>; each byte is treated as one symbol and
    /// the EOD symbol is appended at the end.
    /// </summary>
    /// <param name="data">Bytes to compress.</param>
    /// <returns>The code bits packed into bytes; unused trailing bits are zero.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    /// <exception cref="ArgumentException">A byte has no symbol in the tree.</exception>
    public byte[] Encode(byte[] data)
    {
        if (data == null)
        {
            throw new ArgumentNullException("data");
        }

        // Per-call cache: each distinct symbol is searched in the tree only once.
        var codeCache = new Dictionary<char, List<bool>>();
        var bits = new List<bool>();
        for (int i = 0; i < data.Length; i++)
        {
            bits.AddRange(LookupCode((char)data[i], codeCache));
        }
        bits.AddRange(LookupCode(EOD, codeCache));

        // Size the output from the real bit count. The previous fixed scratch buffer
        // of data.Length * 2 bytes overflowed for empty or short inputs, because
        // the EOD code alone can be longer than 16 bits.
        byte[] compressedData = new byte[(bits.Count + 7) / 8];
        for (int bitPosition = 0; bitPosition < bits.Count; bitPosition++)
        {
            compressedData.WriteBit(bitPosition, 0, bits[bitPosition]);
        }
        return compressedData;
    }

    /// <summary>
    /// Returns the bit path of <paramref name="symbol"/>, searching the tree only
    /// when the symbol is not yet in <paramref name="codeCache"/>.
    /// </summary>
    private List<bool> LookupCode(char symbol, Dictionary<char, List<bool>> codeCache)
    {
        List<bool> code;
        if (!codeCache.TryGetValue(symbol, out code))
        {
            code = Root.Traverse(symbol, new List<bool>());
            if (code == null)
            {
                // Traverse reports a missing symbol with null; fail with a clear
                // message instead of a NullReferenceException further down.
                throw new ArgumentException(
                    "Symbol 0x" + ((int)symbol).ToString("X4") + " is not present in the Huffman tree.");
            }
            codeCache.Add(symbol, code);
        }
        return code;
    }

    /// <summary>
    /// Decompresses a bit stream produced by <see cref="Encode"/>.
    /// </summary>
    /// <param name="data">Compressed bytes.</param>
    /// <returns>
    /// The decoded bytes. Decoding stops at the EOD symbol, or at the end of
    /// <paramref name="data"/> when no EOD symbol is found.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    public byte[] Decode(byte[] data)
    {
        if (data == null)
        {
            throw new ArgumentNullException("data");
        }

        var decoded = new List<byte>(data.Length);
        HuffmanTreeNode current = Root;
        int length = data.Length * 8;
        for (int bitPosition = 0; bitPosition < length; bitPosition++)
        {
            // A set bit selects the right child, a clear bit the left one; when
            // that child does not exist the walk stays on the current node.
            bool bit = data.ReadBit(bitPosition, 0);
            HuffmanTreeNode next = bit ? current.Right : current.Left;
            if (next != null)
            {
                current = next;
            }

            if (!current.IsLeaf())
            {
                continue;
            }
            if (current.Symbol == EOD)
            {
                break;
            }

            decoded.Add((byte)current.Symbol);
            current = Root;
        }
        return decoded.ToArray();
    }
}
/// <summary>
/// Node of a Huffman tree. A node without children is a leaf and carries a
/// symbol; any other node only links to its subtrees.
/// </summary>
class HuffmanTreeNode
{
    /// <summary>Symbol stored in this node.</summary>
    public char Symbol { get; set; }

    /// <summary>Frequency (weight) of this node.</summary>
    public int Frequency { get; set; }

    /// <summary>Subtree reached with a <c>true</c> bit.</summary>
    public HuffmanTreeNode Right { get; set; }

    /// <summary>Subtree reached with a <c>false</c> bit.</summary>
    public HuffmanTreeNode Left { get; set; }

    /// <summary>Tells whether this node has no children.</summary>
    public bool IsLeaf()
    {
        return Left == null && Right == null;
    }

    /// <summary>
    /// Searches this subtree for the leaf holding <paramref name="symbol"/>.
    /// </summary>
    /// <param name="symbol">Symbol to look for.</param>
    /// <param name="data">Bits of the path walked so far to reach this node.</param>
    /// <returns>
    /// The path to the matching leaf (<c>false</c> = left, <c>true</c> = right),
    /// or <c>null</c> when no leaf of this subtree holds the symbol. A match in
    /// the left subtree takes precedence over one in the right subtree.
    /// </returns>
    public List<bool> Traverse(char symbol, List<bool> data)
    {
        if (IsLeaf())
        {
            // Only leaves carry symbols; the path so far is the answer on a match.
            return symbol == Symbol ? data : null;
        }

        List<bool> viaLeft = Left == null ? null : Left.Traverse(symbol, Extend(data, false));
        List<bool> viaRight = Right == null ? null : Right.Traverse(symbol, Extend(data, true));
        return viaLeft ?? viaRight;
    }

    /// <summary>Returns a copy of <paramref name="prefix"/> with one more bit appended.</summary>
    private static List<bool> Extend(List<bool> prefix, bool bit)
    {
        List<bool> path = new List<bool>(prefix);
        path.Add(bit);
        return path;
    }
}
}

View File

@@ -1,12 +1,127 @@
using System; using System.Collections.Generic;
using System.Text; using System.Text;
namespace VAR.UrlCompressor namespace VAR.UrlCompressor
{ {
public class UrlCompressor public class UrlCompressor
{ {
private static HuffmanTree _huffmanTree = null;
/// <summary>
/// Creates the shared fixed Huffman tree on first use; later calls return at once.
/// </summary>
private static void InitHuffmanTree()
{
    if (_huffmanTree != null)
    {
        return;
    }

    // English letter frequencies (percentages * 1000) for 'a' through 'z'.
    // The same weights are reused for the upper-case letters.
    int[] letterWeights =
    {
        8167, 1492, 2782, 4253, 12702, 2228, 2015, 6094, 6966, 153, 772, 4025, 2406,
        6749, 7507, 1929, 95, 5987, 6327, 9056, 2758, 978, 2360, 150, 1974, 74
    };

    // Common symbols, listed in the order they are added to the table.
    // NOTE(review): ')' is not part of this list, so a URL containing it has no
    // code in the tree and cannot be encoded. Adding it would change the tree
    // and with it the encoding of already compressed URLs - confirm before
    // changing.
    const string symbols = " !\"#$%&'(*+,-./:;<=>?@[\\]^_`{|}~";

    // The insertion order (lower case, upper case, digits, symbols) is kept
    // deliberately: the tree is built by enumerating this table.
    var frequencies = new Dictionary<char, int>();
    for (int i = 0; i < letterWeights.Length; i++)
    {
        frequencies.Add((char)('a' + i), letterWeights[i]);
    }
    for (int i = 0; i < letterWeights.Length; i++)
    {
        frequencies.Add((char)('A' + i), letterWeights[i]);
    }

    // Numbers use a fixed frequency of 1000.
    for (char digit = '0'; digit <= '9'; digit++)
    {
        frequencies.Add(digit, 1000);
    }

    foreach (char symbol in symbols)
    {
        // '#' and '$' get an exaggerated weight to minimize their bitstream.
        bool exaggerated = symbol == '#' || symbol == '$';
        frequencies.Add(symbol, exaggerated ? 50000 : 100);
    }

    _huffmanTree = new HuffmanTree(frequencies);
}
public static string Compress(string url) public static string Compress(string url)
{ {
InitHuffmanTree();
// Replace protocol indicator // Replace protocol indicator
if (url.StartsWith("https://") || url.StartsWith("HTTPS://")) if (url.StartsWith("https://") || url.StartsWith("HTTPS://"))
{ {
@@ -18,13 +133,21 @@ namespace VAR.UrlCompressor
} }
byte[] urlBytes = Encoding.ASCII.GetBytes(url); byte[] urlBytes = Encoding.ASCII.GetBytes(url);
return Base62.Encode(urlBytes);
byte[] compressedUrlBytes = _huffmanTree.Encode(urlBytes);
return Base62.Encode(compressedUrlBytes);
} }
public static string Decompress(string compressedUrl) public static string Decompress(string compressedUrl)
{ {
InitHuffmanTree();
byte[] urlBytes = Base62.Decode(compressedUrl); byte[] urlBytes = Base62.Decode(compressedUrl);
string url = Encoding.ASCII.GetString(urlBytes);
byte[] decompressedUrlBytes = _huffmanTree.Decode(urlBytes);
string url = Encoding.ASCII.GetString(decompressedUrlBytes);
// Restore protocol indicator // Restore protocol indicator
if (url.StartsWith("#")) if (url.StartsWith("#"))

View File

@@ -42,6 +42,7 @@
<ItemGroup> <ItemGroup>
<Compile Include="Base62.cs" /> <Compile Include="Base62.cs" />
<Compile Include="ByteExtensions.cs" /> <Compile Include="ByteExtensions.cs" />
<Compile Include="Huffman.cs" />
<Compile Include="Properties\AssemblyInfo.cs" /> <Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="UrlCompressor.cs" /> <Compile Include="UrlCompressor.cs" />
</ItemGroup> </ItemGroup>