Fast Hashing suitable for already very few elements

Share your advanced PureBasic knowledge/code with the community.
Froggerprogger
Enthusiast
Enthusiast
Posts: 423
Joined: Fri Apr 25, 2003 5:22 pm
Contact:

Fast Hashing suitable for already very few elements

Post by Froggerprogger »

This code is a result of this thread:

http://www.purebasic.fr/english/viewtop ... 987#259987

The linked thread contains some more details and background-information.

The problem is the following scenario:
a) There is a couple of servers identified by an ID. Maximal #N of them are active at any time
b) While a server is active, some data for it can be accessed often at random times
c) An active server might become unactivated, and then its slot in the underlying datastructure is freed for a new server

This can be interpretated more general:
We want to store up to #N elements identified by an ID, that contain some data. They can be added or deleted at any time. While they are added we want to access their data identified by the element's ID very fast at any time.
Therefore asymptotically a hashtable would be optimal, but for small #N (e.g. 32) one would usually think, that the overhead would slow it down, so a simple small array with sequential search is much faster. But the following example shows, that by using a simple and fast hashfunction already the hasharray is much faster:

Code: Select all

EnableExplicit

#N = 256 ; the maximal number of elements, must be a power of 2
#Nmask = #N-1

Structure Element
  id.l ; e.g. the socket-id
  active.l ; if 0, then this element might be overwritten
  elemData.l ; some arbitrary data
EndStructure

Define *tempElement.Element
Define i.l

;
; CycleArray
;

Dim CycleArray.Element(#N) ; elements at indices 0..#N-1 are used
Define ca_curAddIndex.l = 0
Define ca_curFindIndex.l = 0

Macro IncreaseCircular(var)
  var = (var + 1) & #Nmask
EndMacro

Macro DecreaseCircular(var)
  var = (var - 1) & #Nmask
EndMacro

Macro CA_AddElement(m_id, m_elemData)
  ; ASSUMPTION: there will never be #N active elements at a time, otherwise danger of endless loop
  While CycleArray(ca_curAddIndex)\active
    IncreaseCircular(ca_curAddIndex)
  Wend
  CycleArray(ca_curAddIndex)\id = m_id
  CycleArray(ca_curAddIndex)\active = 1
  CycleArray(ca_curAddIndex)\elemData = m_elemData
EndMacro

Macro CA_FindElement(m_id, m_p_elem)
  ; ASSUMPTION: the m_id to find IS somewhere in the array, otherwise danger of endless loop
  ca_curFindIndex  = ca_curAddIndex
  While CycleArray(ca_curFindIndex)\id <> m_id
    ; lets search backwards, because higher probability to find in most recent entries
    DecreaseCircular(ca_curFindIndex)
  Wend
  ; let _p_elem point to the found element
  m_p_elem = @CycleArray(ca_curFindIndex)
EndMacro

Macro CA_RemoveElement(m_id)
  ; ASSUMPTION: the m_id to remove IS somewhere in the array, otherwise danger of endless loop
  CA_FindElement(m_id, *tempElement)
  *tempElement\active = 0
EndMacro

Macro CA_DebugElements()
  For i=0 To #N-1
    Debug "--------------------"
    Debug i
    Debug CycleArray(i)\id
    Debug CycleArray(i)\active
    Debug CycleArray(i)\elemData
  Next
EndMacro

;
; HashArray
;

Dim HashArray.Element(#N) ; elements at indices 0..#N-1 are used
Define ha_tempIndex.l

Macro StartHashIndex(var, key)
  var = (key) & #Nmask
EndMacro

Macro IncreaseHashIndex(var)
  var = (var + 123454339) & #Nmask ; 123454339 might be any big prime
EndMacro

Macro HA_AddElement(m_id, m_elemData)
  ; ASSUMPTION: there will never be #N active elements at a time, otherwise danger of endless loop
  StartHashIndex(ha_tempIndex, m_id)
  While HashArray(ha_tempIndex)\active
    IncreaseHashIndex(ha_tempIndex)
  Wend
  HashArray(ha_tempIndex)\id = m_id
  HashArray(ha_tempIndex)\active = 1
  HashArray(ha_tempIndex)\elemData = m_elemData
EndMacro

Macro HA_FindElement(m_id, m_p_elem)
  ; ASSUMPTION: the m_id to find IS somewhere in the array, otherwise danger of endless loop
  StartHashIndex(ha_tempIndex, m_id)
  While HashArray(ha_tempIndex)\id <> m_id
    IncreaseHashIndex(ha_tempIndex)
  Wend
  ; let _p_elem point to the found element
  m_p_elem = @HashArray(ha_tempIndex)
EndMacro

Macro HA_RemoveElement(m_id)
  ; ASSUMPTION: the m_id to remove IS somewhere in the array, otherwise danger of endless loop
  HA_FindElement(m_id, *tempElement)
  *tempElement\active = 0
EndMacro

Macro HA_DebugElements()
  For i=0 To #N-1

    Debug "--------------------"
    Debug i

    Debug HashArray(i)\id
    Debug HashArray(i)\active
    Debug HashArray(i)\elemData
  Next
EndMacro



;########################################
; SPEEDING-TEST
;########################################

;
; testsuite: creates some actions and works through them once by CA_* and once by HA_*
;
#RandomSeed = 123456789
RandomSeed(#RandomSeed)
 
Structure Action
  type.l ; 1=add, 2=modify data, 3=remove
  id.l
  newData.l ; only for type 1 and 2
EndStructure

Macro CA_HandleActions()
  Define ca_time.l = ElapsedMilliseconds()
  Define id.l
  Define elemData.l
  Define *elem.Element
  For i=0 To #ActionsArraySize-1
    Debug "CA_HandleActions: " + Str(Actions(i)\type) + " " +  Str(Actions(i)\id)
    Select Actions(i)\type
      Case 1 : id = Actions(i)\id
                elemData = Actions(i)\newData
                CA_AddElement(id, elemData)

      Case 2 : id = Actions(i)\id
                CA_FindElement(id, *elem)
                *elem\elemData = Actions(i)\newData

      Case 3 : id = Actions(i)\id
                CA_RemoveElement(id)

    EndSelect
  Next
  ca_time = ElapsedMilliseconds() - ca_time
EndMacro

Macro HA_HandleActions()
  Define ha_time.l = ElapsedMilliseconds()
  Define id.l
  Define elemData.l
  Define *elem.Element
  For i=0 To #ActionsArraySize-1
    Debug "HA_HandleActions: " + Str(Actions(i)\type) + " " +  Str(Actions(i)\id)
    Select Actions(i)\type
      Case 1 : id = Actions(i)\id
                elemData = Actions(i)\newData
                HA_AddElement(id, elemData)

      Case 2 : id = Actions(i)\id
                HA_FindElement(id, *elem)
                *elem\elemData = Actions(i)\newData

      Case 3 : id = Actions(i)\id
                HA_RemoveElement(id)

    EndSelect
  Next
  ha_time = ElapsedMilliseconds() - ha_time
EndMacro


;
; EXAMPLE: Access-speed to #N-2 servers
;
#NServer = (#N-2)
#NAccesses = 1000000
#ActionsArraySize = 2*#NServer+#NAccesses

Macro ServerID(m_i)
  m_i
  ;1234567*(m_i)+7654321 ; use this more pseudo-random instead of m_i shows no big difference in speedings
  ;m_i*#N ; this is the worst-case in theory. All servers are initially mapped to the same entry
  ;m_i*#N/2 ; still a very bad case
  ;100000+123*m_i  ; in this case the offset (here 123) should be no power of 2
EndMacro

;
; create the actions
;
Dim Actions.Action(#ActionsArraySize)

; add server
For i=0 To #NServer-1
  Actions(i)\type = 1
  Actions(i)\id = ServerID(i) ; a unique id
  Actions(i)\newData = Random(9999)
Next

; access data
For i=#NServer To #NServer + #NAccesses - 1
  Actions(i)\type = 2
  Actions(i)\id = ServerID(Random(#NServer-1))
  Actions(i)\newData = Random(9999)
Next

; remove server
Define count.l = 0
For i=#NServer+#NAccesses To 2*#NServer + #NAccesses - 1
  Actions(i)\type = 3
  Actions(i)\id = ServerID(count) ; the previously added ids
  count + 1
Next

;
; start test
;

DisableDebugger
CA_HandleActions()
EnableDebugger
;CA_DebugElements() ; show the internals of CycleArray

DisableDebugger
HA_HandleActions()
EnableDebugger
;HA_DebugElements() ; show the internals of HashArray

Debug "Timing for CA: " + Str(ca_time)
Debug "Timing for HA: " + Str(ha_time) 
So this approach is faster, as long as the hashfunction can be kept simple, which depends on the distribution of the IDs used as keys. And already a simple incrementing ID-generator works very well, as well as a random generator and most others. Only some special (though not very exotic) ID-combinations will slow down the implementation and would need a more complex (and slower) hashfunction.

The other (above linked) thread might contain more details on that.
%1>>1+1*1/1-1!1|1&1<<$1=1