From 044f0263654420219bd1d59d1fcdc5b46326e78a Mon Sep 17 00:00:00 2001
From: "Valeriano A.R"
Date: Sun, 9 Jul 2017 23:24:24 +0200
Subject: [PATCH] Use Huffman compression with a fixed tree to lower data usage.

---
 VAR.UrlCompressor/ByteExtensions.cs        |  24 +-
 VAR.UrlCompressor/Huffman.cs               | 206 +++++++++++++++++++++
 VAR.UrlCompressor/UrlCompressor.cs         | 132 +++++++++++++-
 VAR.UrlCompressor/VAR.UrlCompressor.csproj |   1 +
 4 files changed, 359 insertions(+), 4 deletions(-)
 create mode 100644 VAR.UrlCompressor/Huffman.cs

diff --git a/VAR.UrlCompressor/ByteExtensions.cs b/VAR.UrlCompressor/ByteExtensions.cs
index 917160e..054a290 100644
--- a/VAR.UrlCompressor/ByteExtensions.cs
+++ b/VAR.UrlCompressor/ByteExtensions.cs
@@ -1,4 +1,6 @@
-namespace VAR.UrlCompressor
+using System;
+
+namespace VAR.UrlCompressor
 {
     static class ByteExtensions
     {
@@ -28,5 +30,25 @@
                 bytes[bytePosition] = (byte)(bytes[bytePosition] & (0xffffffff - (0x1 << (7 - bitPosition))));
             }
         }
+
+        /// <summary>
+        /// Stores <paramref name="value"/> at <paramref name="position"/>, growing the buffer when needed.
+        /// </summary>
+        /// <param name="bytes">Buffer to write into.</param>
+        /// <param name="position">Zero-based index of the byte to write.</param>
+        /// <param name="value">Byte to store.</param>
+        /// <returns>The buffer that holds the value: the original one or a larger copy of it.</returns>
+        public static byte[] WriteByte(this byte[] bytes, int position, byte value)
+        {
+            while (bytes.Length <= position)
+            {
+                // An empty buffer cannot grow by doubling (0 * 2 == 0), so start from one byte.
+                byte[] newBytes = new byte[Math.Max(1, bytes.Length * 2)];
+                Array.Copy(bytes, newBytes, bytes.Length);
+                bytes = newBytes;
+            }
+            bytes[position] = value;
+            return bytes;
+        }
     }
 }
diff --git a/VAR.UrlCompressor/Huffman.cs b/VAR.UrlCompressor/Huffman.cs
new file mode 100644
index 0000000..1fa37e3
--- /dev/null
+++ b/VAR.UrlCompressor/Huffman.cs
@@ -0,0 +1,206 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace VAR.UrlCompressor
+{
+    /// <summary>
+    /// Huffman coder over single characters, with an extra end-of-data symbol
+    /// that tells the decoder where to stop.
+    /// </summary>
+    class HuffmanTree
+    {
+        private List<HuffmanTreeNode> nodes = new List<HuffmanTreeNode>();
+        public HuffmanTreeNode Root { get; set; }
+        private Dictionary<char, int> _frequencies = new Dictionary<char, int>();
+
+        // End-of-data marker; no byte cast to char can produce 0xFFFF.
+        private const char EOD = (char)0xFFFF;
+
+        /// <summary>
+        /// Builds the tree from a symbol/frequency table.
+        /// </summary>
+        /// <param name="frequencies">Frequency of every encodable symbol.</param>
+        public HuffmanTree(Dictionary<char, int> frequencies)
+        {
+            // Work on a copy: adding the end-of-data entry to the caller's table would
+            // modify it behind its back and throw if the same table were passed twice.
+            _frequencies = new Dictionary<char, int>(frequencies);
+            _frequencies[EOD] = 1;
+            BuildTree();
+        }
+
+        /// <summary>
+        /// Builds the tree from the character counts of a sample text.
+        /// </summary>
+        /// <param name="source">Text whose characters are counted.</param>
+        public HuffmanTree(string source)
+        {
+            for (int i = 0; i < source.Length; i++)
+            {
+                int count;
+                _frequencies.TryGetValue(source[i], out count);
+                _frequencies[source[i]] = count + 1;
+            }
+            _frequencies[EOD] = 1;
+
+            BuildTree();
+        }
+
+        // Merges the two least frequent nodes until a single root remains.
+        private void BuildTree()
+        {
+            foreach (KeyValuePair<char, int> symbol in _frequencies)
+            {
+                nodes.Add(new HuffmanTreeNode() { Symbol = symbol.Key, Frequency = symbol.Value });
+            }
+
+            while (nodes.Count > 1)
+            {
+                // OrderBy is stable: nodes with equal frequency keep their list order.
+                List<HuffmanTreeNode> orderedNodes = nodes.OrderBy(node => node.Frequency).ToList();
+
+                HuffmanTreeNode parent = new HuffmanTreeNode()
+                {
+                    Symbol = '*',
+                    Frequency = orderedNodes[0].Frequency + orderedNodes[1].Frequency,
+                    Left = orderedNodes[0],
+                    Right = orderedNodes[1]
+                };
+
+                nodes.Remove(orderedNodes[0]);
+                nodes.Remove(orderedNodes[1]);
+                nodes.Add(parent);
+            }
+
+            // Assigned after the loop so a table with a single symbol still gets a root.
+            Root = nodes.FirstOrDefault();
+        }
+
+        /// <summary>
+        /// Encodes every byte of <paramref name="data"/> followed by the end-of-data symbol.
+        /// </summary>
+        /// <param name="data">Bytes to compress; each one is looked up as a character.</param>
+        /// <returns>The code bits packed into bytes.</returns>
+        /// <exception cref="ArgumentException">A byte has no entry in the tree.</exception>
+        public byte[] Encode(byte[] data)
+        {
+            // Collect the bits first: the output size depends on the code lengths, and a
+            // buffer sized from the input length overflows on short or empty input.
+            var bits = new List<bool>();
+            for (int i = 0; i < data.Length; i++)
+            {
+                AppendCode((char)data[i], bits);
+            }
+            AppendCode(EOD, bits);
+
+            byte[] compressedData = new byte[(bits.Count + 7) / 8];
+            for (int bitPosition = 0; bitPosition < bits.Count; bitPosition++)
+            {
+                compressedData.WriteBit(bitPosition, 0, bits[bitPosition]);
+            }
+
+            return compressedData;
+        }
+
+        // Appends the code of one symbol, rejecting symbols the tree does not contain.
+        private void AppendCode(char symbol, List<bool> bits)
+        {
+            List<bool> code = Root.Traverse(symbol, new List<bool>());
+            if (code == null)
+            {
+                throw new ArgumentException("Symbol not present in the Huffman tree: 0x" + ((int)symbol).ToString("X4"));
+            }
+            bits.AddRange(code);
+        }
+
+        /// <summary>
+        /// Decodes a bit stream produced by <see cref="Encode"/>.
+        /// </summary>
+        /// <param name="data">Packed code bits.</param>
+        /// <returns>The bytes decoded before the end-of-data symbol or the end of the input.</returns>
+        public byte[] Decode(byte[] data)
+        {
+            HuffmanTreeNode current = Root;
+            byte[] scratch = new byte[data.Length];
+            int bitPosition = 0;
+            int bytePosition = 0;
+
+            int length = data.Length * 8;
+            while (bitPosition < length)
+            {
+                bool bit = data.ReadBit(bitPosition, 0);
+                bitPosition++;
+
+                // A set bit selects the right child; a missing child leaves the cursor in place.
+                HuffmanTreeNode next = bit ? current.Right : current.Left;
+                if (next != null)
+                {
+                    current = next;
+                }
+
+                if (current.IsLeaf())
+                {
+                    if (current.Symbol == EOD) { break; }
+                    scratch = scratch.WriteByte(bytePosition, (byte)current.Symbol);
+                    bytePosition++;
+                    current = Root;
+                }
+            }
+
+            byte[] decompressedData = new byte[bytePosition];
+            Array.Copy(scratch, decompressedData, bytePosition);
+
+            return decompressedData;
+        }
+    }
+
+    /// <summary>
+    /// Node of the Huffman tree; a node without children is a leaf that carries a symbol.
+    /// </summary>
+    class HuffmanTreeNode
+    {
+        public char Symbol { get; set; }
+        public int Frequency { get; set; }
+        public HuffmanTreeNode Right { get; set; }
+        public HuffmanTreeNode Left { get; set; }
+
+        public bool IsLeaf()
+        {
+            return (Right == null && Left == null);
+        }
+
+        /// <summary>
+        /// Searches this subtree for a symbol and returns the path that leads to it.
+        /// </summary>
+        /// <param name="symbol">Symbol to look for.</param>
+        /// <param name="data">Path walked so far; false is a left step, true a right step.</param>
+        /// <returns>The path to the symbol, or null when the subtree does not contain it.</returns>
+        public List<bool> Traverse(char symbol, List<bool> data)
+        {
+            if (IsLeaf())
+            {
+                return symbol == Symbol ? data : null;
+            }
+
+            // The left subtree is searched first and wins when both hold the symbol.
+            if (Left != null)
+            {
+                List<bool> leftPath = new List<bool>(data) { false };
+                List<bool> left = Left.Traverse(symbol, leftPath);
+                if (left != null)
+                {
+                    return left;
+                }
+            }
+
+            if (Right != null)
+            {
+                List<bool> rightPath = new List<bool>(data) { true };
+                return Right.Traverse(symbol, rightPath);
+            }
+
+            return null;
+        }
+    }
+}
diff --git a/VAR.UrlCompressor/UrlCompressor.cs b/VAR.UrlCompressor/UrlCompressor.cs
index 08d4305..80a288a 100644
--- a/VAR.UrlCompressor/UrlCompressor.cs
+++ b/VAR.UrlCompressor/UrlCompressor.cs
@@ -1,12 +1,130 @@
-using System;
+using System.Collections.Generic;
 using System.Text;
 
 namespace VAR.UrlCompressor
 {
     public class UrlCompressor
     {
+        private static HuffmanTree _huffmanTree = null;
+
+        // Builds the shared tree on first use; Compress and Decompress both call this
+        // before touching _huffmanTree, so they always work with the same fixed table.
+        private static void InitHuffmanTree()
+        {
+            if (_huffmanTree != null) { return; }
+
+            var frequencies = new Dictionary<char, int>
+            {
+                // English frequencies (percentages*1000)
+                { 'a', 8167},
+                { 'b', 1492},
+                { 'c', 2782},
+                { 'd', 4253},
+                { 'e', 12702},
+                { 'f', 2228},
+                { 'g', 2015},
+                { 'h', 6094},
+                { 'i', 6966},
+                { 'j', 153},
+                { 'k', 772},
+                { 'l', 4025},
+                { 'm', 2406},
+                { 'n', 6749},
+                { 'o', 7507},
+                { 'p', 1929},
+                { 'q', 95},
+                { 'r', 5987},
+                { 's', 6327},
+                { 't', 9056},
+                { 'u', 2758},
+                { 'v', 978},
+                { 'w', 2360},
+                { 'x', 150},
+                { 'y', 1974},
+                { 'z', 74},
+
+                // English frequencies, upper case (percentages*1000)
+                { 'A', 8167},
+                { 'B', 1492},
+                { 'C', 2782},
+                { 'D', 4253},
+                { 'E', 12702},
+                { 'F', 2228},
+                { 'G', 2015},
+                { 'H', 6094},
+                { 'I', 6966},
+                { 'J', 153},
+                { 'K', 772},
+                { 'L', 4025},
+                { 'M', 2406},
+                { 'N', 6749},
+                { 'O', 7507},
+                { 'P', 1929},
+                { 'Q', 95},
+                { 'R', 5987},
+                { 'S', 6327},
+                { 'T', 9056},
+                { 'U', 2758},
+                { 'V', 978},
+                { 'W', 2360},
+                { 'X', 150},
+                { 'Y', 1974},
+                { 'Z', 74},
+
+                // Numbers, use a fixed frequency of 1000.
+                { '0', 1000},
+                { '1', 1000},
+                { '2', 1000},
+                { '3', 1000},
+                { '4', 1000},
+                { '5', 1000},
+                { '6', 1000},
+                { '7', 1000},
+                { '8', 1000},
+                { '9', 1000},
+
+                // Common symbols
+                { ' ', 100},
+                { '!', 100},
+                { '"', 100},
+                { '#', 50000}, // NOTE: Exaggerated to minimize the bitstream of this symbol '#'
+                { '$', 50000}, // NOTE: Exaggerated to minimize the bitstream of this symbol '$'
+                { '%', 100},
+                { '&', 100},
+                { '\'', 100},
+                { '(', 100},
+                { ')', 100}, // Without an entry, a URL containing ')' cannot be encoded.
+                { '*', 100},
+                { '+', 100},
+                { ',', 100},
+                { '-', 100},
+                { '.', 100},
+                { '/', 100},
+                { ':', 100},
+                { ';', 100},
+                { '<', 100},
+                { '=', 100},
+                { '>', 100},
+                { '?', 100},
+                { '@', 100},
+                { '[', 100},
+                { '\\', 100},
+                { ']', 100},
+                { '^', 100},
+                { '_', 100},
+                { '`', 100},
+                { '{', 100},
+                { '|', 100},
+                { '}', 100},
+                { '~', 100},
+            };
+            _huffmanTree = new HuffmanTree(frequencies);
+        }
+
         public static string Compress(string url)
         {
+            InitHuffmanTree();
+
             // Replace protocol indicator
             if (url.StartsWith("https://") || url.StartsWith("HTTPS://"))
             {
@@ -18,13 +136,21 @@ namespace VAR.UrlCompressor
             }
 
             byte[] urlBytes = Encoding.ASCII.GetBytes(url);
-            return Base62.Encode(urlBytes);
+
+            byte[] compressedUrlBytes = _huffmanTree.Encode(urlBytes);
+
+            return Base62.Encode(compressedUrlBytes);
         }
 
         public static string Decompress(string compressedUrl)
         {
+            InitHuffmanTree();
+
             byte[] urlBytes = Base62.Decode(compressedUrl);
-            string url = Encoding.ASCII.GetString(urlBytes);
+
+            byte[] decompressedUrlBytes = _huffmanTree.Decode(urlBytes);
+
+            string url = Encoding.ASCII.GetString(decompressedUrlBytes);
 
             // Restore protocol indicator
             if (url.StartsWith("#"))
diff --git a/VAR.UrlCompressor/VAR.UrlCompressor.csproj b/VAR.UrlCompressor/VAR.UrlCompressor.csproj
index fef7b68..cadc461 100644
--- a/VAR.UrlCompressor/VAR.UrlCompressor.csproj
+++ b/VAR.UrlCompressor/VAR.UrlCompressor.csproj
@@ -42,6 +42,7 @@
+