Page 1 of 1

Huffman encoding (compression)

Posted: Sat Dec 18, 2010 8:59 am
by Trond
bitwriter.pb (needed include file)

Code: Select all

Structure SBitWriterState
  File.i
  Buf.a
  Pos.a
EndStructure

Procedure FlushBitWriter(*State.SBitWriterState)
  WriteAsciiCharacter(*State\File, *State\Buf)
  *State\Buf = 0
  *State\Pos = 0
EndProcedure

Procedure WriteBit(*State.SBitWriterState, Bit.i)
  Mask = Bit << *State\Pos
  *State\Buf | Mask
  *State\Pos + 1
  If *State\Pos = 8
    FlushBitWriter(*State)
  EndIf
EndProcedure

Procedure WriteBits(*State.SBitWriterState, List Bits.i())
  ForEach Bits()
    WriteBit(*State, Bits())
  Next
EndProcedure

Procedure WriteUnalignedByte(*State.SBitWriterState, Value.a)
  For I = 0 To 7
    If (1 << I) & Value
      WriteBit(*State, 1)
    Else
      WriteBit(*State, 0)
    EndIf
  Next
EndProcedure

Procedure ReadBit(*State.SBitWriterState)
  If *State\Pos = 0
    If Not Eof(*State\File)
      *State\Buf = ReadAsciiCharacter(*state\File)
    Else
      ProcedureReturn -1
    EndIf
  EndIf
  If (1 << *State\Pos) & *State\Buf
    Bit = 1
  EndIf
  *State\Pos + 1
  If *State\Pos = 8
    *State\Pos = 0
  EndIf
  ProcedureReturn Bit
EndProcedure

Procedure ReadUnalignedByte(*State.SBitWriterState)
  For I = 0 To 7
    Value.a | (ReadBit(*State) << I)
  Next
  ProcedureReturn Value
EndProcedure
The actual code:

Code: Select all

IncludeFile "bitwriter.pb"

Structure SAsciiArray
  a.a[0]
EndStructure

Structure SFrequencies
  i.i[256]
EndStructure

Structure SHuffmanCode
  List Bits.i()
EndStructure

Structure SHuffmanCodeMap
  Bits.SHuffmanCode[256]
EndStructure

#SHuffmanNode_Leaf = 1
#SHuffmanNode_Internal = 2

Structure SHuffmanNode
  Kind.i
  Frequency.i
  ; Leaf attributes
  Value.a
  ; Internal node attributes
  *Left.SHuffmanNode  ; 0
  *Right.SHuffmanNode ; 1
EndStructure

Procedure NewHuffmanLeaf()
  *N.SHuffmanNode = AllocateMemory(SizeOf(SHuffmanNode))
  *N\Kind = #SHuffmanNode_Leaf
  ProcedureReturn *N
EndProcedure

Procedure NewHuffmanInternal()
  *N.SHuffmanNode = AllocateMemory(SizeOf(SHuffmanNode))
  *N\Kind = #SHuffmanNode_Internal
  ProcedureReturn *N
EndProcedure

Procedure CreateCodeMap(*Tree.SHuffmanNode, *CodeMap.SHuffmanCodeMap, List Bits.i())
  Select *Tree\Kind
    Case #SHuffmanNode_Internal
      AddElement(Bits())
      CreateCodeMap(*Tree\Left, *CodeMap, Bits())
      Bits() = 1
      CreateCodeMap(*Tree\Right, *CodeMap, Bits())
      DeleteElement(Bits())
    Case #SHuffmanNode_Leaf
      CopyList(Bits(), *CodeMap\Bits[*Tree\Value]\Bits())
      LastElement(Bits())
  EndSelect
EndProcedure

Procedure WriteHuffmanData(*CodeMap.SHuffmanCodeMap, *Mem.SAsciiArray, MemLen, *Output.SBitWriterState)
  For I = 0 To MemLen
    WriteBits(*Output, *CodeMap\Bits[*Mem\a[i]]\Bits())
  Next
EndProcedure

Procedure WriteHuffmanTree(*Tree.SHuffmanNode, *Output.SBitWriterState)
  Select *Tree\Kind
    Case #SHuffmanNode_Internal
      WriteBit(*Output, 0)
      WriteHuffmanTree(*Tree\Left, *Output)
      WriteHuffmanTree(*Tree\Right, *Output)
    Case #SHuffmanNode_Leaf
      WriteBit(*Output, 1)
      WriteUnalignedByte(*Output, *Tree\Value)
  EndSelect
EndProcedure

Procedure ReadHuffmanTree(*Input.SBitWriterState)
  *Node.SHuffmanNode = NewHuffmanInternal()
  Bit = ReadBit(*Input)
  If Bit = 1
    *Node\Kind = #SHuffmanNode_Leaf
    *Node\Value = ReadUnalignedByte(*Input)
  Else
    *Node\Left = ReadHuffmanTree(*Input)
    *Node\Right = ReadHuffmanTree(*Input)
  EndIf
  ProcedureReturn *Node
EndProcedure

Procedure DecodeHuffmanValue(*Tree.SHuffmanNode, *Input.SBitWriterState)
  If *Tree\Kind = #SHuffmanNode_Leaf
    ProcedureReturn *Tree\Value
  EndIf
  Bit = ReadBit(*Input)
  Select Bit
    Case 0
      ProcedureReturn DecodeHuffmanValue(*Tree\Left, *Input)
    Case 1
      ProcedureReturn DecodeHuffmanValue(*Tree\Right, *Input)
    Case -1
      CallDebugger
      ProcedureReturn -1
  EndSelect
EndProcedure

Procedure CreateHuffmanTree(*Freq.SFrequencies)
  Protected *Root.SHuffmanNode
  ; Create a leaf node for each symbol
  NewList Queue.SHuffmanNode()
  For I = 0 To 255
    If *Freq\i[i]
      AddElement(Queue())
      Queue()\Kind = #SHuffmanNode_Leaf
      Queue()\Frequency = *Freq\i[i]
      Queue()\Value = i
    EndIf
  Next
  
  ; Generate huffman tree
  While ListSize(Queue()) >= 2
    SortStructuredList(Queue(), #PB_Sort_Ascending, OffsetOf(SHuffmanNode\Frequency), #PB_Sort_Integer)
    FirstElement(Queue())
    
    NewNode.SHuffmanNode
    NewNode\Kind = #SHuffmanNode_Internal
    NewNode\Left = NewHuffmanLeaf()
    CopyStructure(@Queue(), NewNode\Left, SHuffmanNode)
    DeleteElement(Queue(), 1)
    NewNode\Right = NewHuffmanLeaf()
    CopyStructure(@Queue(), NewNode\Right, SHuffmanNode)
    NewNode\Frequency = NewNode\Left\Frequency + NewNode\Right\Frequency
    Queue() = NewNode
  Wend
  
  *Root = NewHuffmanInternal()
  CopyStructure(@Queue(), *Root, SHuffmanNode)
  ClearList(Queue())
  ProcedureReturn *Root
EndProcedure

; Encode from memory to file
Procedure HuffEncode(*Mem.SAsciiArray, ByteCount, Filename.s)
  Protected Freq.SFrequencies
  Protected Max = ByteCount-1
  Protected WriteState.SBitWriterState
  WriteState\File = CreateFile(#PB_Any, Filename)
  If WriteState\File = 0
    ProcedureReturn 0
  EndIf
  
  ; Count frequencies
  For I = 0 To Max
    Freq\i[*Mem\a[i]] + 1
  Next
  
  ; Create huffman tree
  *Root = CreateHuffmanTree(Freq)
  
  ; Save byte length of uncompressed data
  WriteQuad(WriteState\File, ByteCount)
  
  ; Save huffman tree
  WriteHuffmanTree(*Root, WriteState)
  
  ; Create code map
  Protected CodeMap.SHuffmanCodeMap
  NewList Bits.i()
  CreateCodeMap(*Root, CodeMap, Bits())
  
  ; Save huffman encoded data
  WriteHuffmanData(CodeMap, *Mem, ByteCount, WriteState)
  FlushBitWriter(WriteState)
  CloseFile(WriteState\File)
  ProcedureReturn 1
EndProcedure

; Decode from file to memory. Usage:
; ByteCount = HuffDecode("filename", @*MyPtr)
Procedure HuffDecode(Filename.s, *pPtr.Integer)
  Protected State.SBitWriterState
  Protected *Mem.SAsciiArray
  Protected ByteCount
  State\File = ReadFile(#PB_Any, Filename)
  If state\File = 0
    ProcedureReturn 0
  EndIf
  ; Read byte length of uncompressed data
  ByteCount = ReadQuad(State\File)
  ; Read huffman tree
  *Tree = ReadHuffmanTree(State)
  ; Decode file
  *Mem = AllocateMemory(ByteCount+2)
  Max = ByteCount - 1
  For I = 0 To Max
    *Mem\a[i] = DecodeHuffmanValue(*Tree, State)
  Next
  *pPtr\i = *Mem
  ProcedureReturn ByteCount
EndProcedure

; Encode from file to file
Procedure HuffEncodeFromFile(Infile.s, Outfile.s)
  Protected In
  In = ReadFile(#PB_Any, Infile)
  If In
    L = Lof(In)
    Max = L-1
    *Mem.SAsciiArray = AllocateMemory(L)
    ReadData(In, *Mem, L)
    CloseFile(In)
    a = HuffEncode(*Mem, L, Outfile)
    FreeMemory(*Mem)
    ProcedureReturn a
  EndIf
  ProcedureReturn 0
EndProcedure

;- EXAMPLE:
Original.s = "/home/trond/asdfasdf.svg"
Compressed.s = "/home/trond/asdfasdf.avg.hz"
Uncompressed.s = "/home/trond/asdfasdf_2.svg"

; Compress
HuffEncodeFromFile(Original, Compressed)

; Decompress
Length = HuffDecode(Compressed, @*Mem)
OpenFile(0, Uncompressed)
WriteData(0, *Mem, Length)
CloseFile(0)

; Show compression ratio
OrigSize.i = FileSize(Original)
CompressedSize.i = FileSize(Compressed)
Percent.d = CompressedSize/OrigSize*100
Debug "Compressed to " + StrD(Percent, 1) + "% of original"

Re: Huffman encoding (compression)

Posted: Sat Dec 18, 2010 6:42 pm
by Joakim Christiansen
Interesting!

Could you tell us advantages and disadvantages of the Huffman encoding?

Re: Huffman encoding (compression)

Posted: Sat Dec 18, 2010 7:53 pm
by Trond
The disadvantage is that arithmetic encoding compresses better.
The advantage is that I understood how to implement it!

Also it is an advantage over the built-in compression of PB that the generated files are usable on both 32-bit and 64-bit systems (I didn't test it on PowerPC, though).

Huffman encoding is an interesting algorithm because it doesn't use a dictionary (like other compression like zip does).

Re: Huffman encoding (compression)

Posted: Sat Dec 18, 2010 10:30 pm
by idle
Thanks, very interesting.

Re: Huffman encoding (compression)

Posted: Sat Dec 18, 2010 11:03 pm
by Thorium
Huffman is especialy effective for compressing text. And it's part of a lot of major compressions. For example you can do first a run length encoding than do a dictionary encoding and than a entropy encoding (huffman). That will get you a very good compression.

Re: Huffman encoding (compression)

Posted: Sun Dec 19, 2010 2:37 am
by IdeasVacuum
Thank you for sharing the code Trond. It is pretty good at compressing text files - tested a 20174 line (1982331 char) 1.88MB file:

WinXP 32bit

Huffman: 359ms compress, 375ms decompress, 867KB.

PB's Pack functions:

Compression value (0): 344ms pack, 94 unpack, 404KB
Compression value (5): 750ms pack, 94 unpack, 104KB
Compression value (9): 1156ms pack, 156 unpack, 95KB