Population count

Bare metal programming in PureBasic, for experienced users
Little John
Addict
Addict
Posts: 4519
Joined: Thu Jun 07, 2007 3:25 pm
Location: Berlin, Germany

Population count

Post by Little John »

Hi all,

I want to count the number of bits that are set to 1 in a 32 bit variable.

The following code seems to work correctly on Windows, with PB 5.31 x86 and x64.
Are there any mistakes in the code, or is there any other room for improvement?
Thanks in advance!

Code: Select all

#HasSSE42 = #False  ; Does the processor support the SSE 4.2 instruction set?
                    ; Can be checked with idle's fine CPUInfo module:
                    ; http://www.purebasic.fr/english/viewtopic.php?f=12&t=61060


Procedure.l PopCount (k.l)
   ; -- Population count:
   ; get number of bits that are set to 1 in 'k'
   
   CompilerIf #HasSSE42
      ! mov    eax, [p.v_k]
      ! popcnt eax, eax
      
   CompilerElse   
      ! mov  edx, [p.v_k]
      ! xor  eax, eax
      ! mov  ecx, 32
      
      ! popcount_again:  
      ! dec  ecx
      ! bt   edx, ecx
      ! jnc  popcount_notset
      ! inc  eax
      ! popcount_notset:   
      ! or   ecx, ecx
      ! jnz  popcount_again
   CompilerEndIf   
   
   ProcedureReturn           ; return the value of eax
EndProcedure


Debug PopCount(%0)                                 ;  0
Debug PopCount(%11)                                ;  2
Debug PopCount(%10000000000000000000000000000000)  ;  1
Debug PopCount(%11111111111111111111111111111111)  ; 32
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3870
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Re: Population count

Post by wilbert »

Here's a second method (Brian Kernighan's way)
Especially faster when not all bits are set.

Code: Select all

 Procedure.l PopCount_m2(l.l)
  !xor eax, eax
  !mov edx, [p.v_l]
  !and edx, edx
  !jz popcount_m2_done
  !popcount_m2_loop:
  !inc eax
  !lea ecx, [edx - 1]
  !and edx, ecx
  !jnz popcount_m2_loop
  !popcount_m2_done:
  ProcedureReturn
EndProcedure
Windows (x64)
Raspberry Pi OS (Arm64)
User avatar
luis
Addict
Addict
Posts: 3876
Joined: Wed Aug 31, 2005 11:09 pm
Location: Italy

Re: Population count

Post by luis »

I don't remember much about ASM, but I would simplify the second part like this by using the carry:

Code: Select all

 !mov edx, [p.v_k] 
 !xor eax, eax
 !mov ecx, 32 
  
 !popcount:  
 !shl edx, 1
 !adc eax, 0
 !loop popcount
If you have something against loop (I don't) you can replace it with

Code: Select all

 
 !dec ecx
 !jnz popcount 
"Have you tried turning it off and on again ?"
A little PureBasic review
Little John
Addict
Addict
Posts: 4519
Joined: Thu Jun 07, 2007 3:25 pm
Location: Berlin, Germany

Re: Population count

Post by Little John »

Cool. 8) Many thanks, wilbert and luis!

These are typical results that I get for speed testing (PB 5.31 x86):

Code: Select all

---------------------------
PopCount speed test
---------------------------
t0 =   29 ms   (11%)  ; 'popcnt' asm instruction
t1 =  259 ms  (100%)  ; wilbert
t2 =  334 ms  (128%)  ; luis
t3 =  737 ms  (284%)  ; luis (using 'loop')
t4 = 1316 ms  (508%)  ; LJ
t5 = 1240 ms  (478%)  ; Trond (recursive)
t6 =  854 ms  (329%)  ; Trond
t7 =   40 ms   (15%)  ; idle
---------------------------
For the record, here is the code that I used for testing. Thanks again!

Code: Select all

; PB 5.31

EnableExplicit

Procedure.l PopCount0 (k.l)
   ! mov    eax, [p.v_k]
   ! popcnt eax, eax
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount1 (k.l)
   ; -- wilbert
   
   !xor eax, eax
   !mov edx, [p.v_k]
   !and edx, edx
   !jz popcount_m2_done
   
   !popcount_m2_loop:
   !inc eax
   !lea ecx, [edx - 1]
   !and edx, ecx
   !jnz popcount_m2_loop
   
   !popcount_m2_done:
   ProcedureReturn
EndProcedure


Procedure.l PopCount2 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount2:
   !shl edx, 1
   !adc eax, 0
   !dec ecx
   !jnz popcount2
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount3 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount3:
   !shl edx, 1
   !adc eax, 0
   !loop popcount3
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount4 (k.l)
   ; -- LJ
   
   ! mov  edx, [p.v_k]
   ! xor  eax, eax
   ! mov  ecx, 32
   
   ! popcount_again:
   ! dec  ecx
   ! bt   edx, ecx
   ! jnc  popcount_notset
   ! inc  eax
   ! popcount_notset:   
   ! or   ecx, ecx
   ! jnz  popcount_again
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount5 (v.l, i = 0)
   ; -- Trond
   
   If i < 32
      ProcedureReturn (v & 1 << i) >> i + PopCount5(v, i+1)
   EndIf
EndProcedure


Procedure.l PopCount6 (v.l)
   ; -- Trond
   
   Protected i.i, Agg.l
   
   For i = 0 To 31
      Agg + (v & 1 << i) >> i
   Next
   ProcedureReturn Agg
EndProcedure


Procedure.l Popcount7 (x.l)
   ; -- idle (adapted for 32 bit)
   
   ;x - (x >> 1) &  $55555555           
   !mov eax, [p.v_x]
   !mov edx, eax
   !shr edx, 1
   !and edx, 0x55555555
   !sub eax, edx
   ;x = (x & $33333333) + ((x >> 2) & $33333333)
   !mov edx, eax       ;x
   !and eax, 0x33333333
   !shr edx, 2
   !and edx, 0x33333333
   !add eax, edx
   ;x = (x + (x >> 4)) & $0f0f0f0f0f
   !mov edx, eax
   !shr edx, 4
   !add eax, edx
   !and eax, 0x0f0f0f0f
   ;x * 0x01010101 >> 24
   !imul eax, 0x01010101
   !shr eax, 24
   ProcedureReturn 
EndProcedure 


; ---- Initialisation ----
Define.i i, x, rep=10000000
Define.i t0, t1, t2, t3, t4, t5, t6, t7

Dim Rnd.l(rep)
For i = 1 To rep
   Rnd(i) = Random(2147483647)
Next


; ---- Small check whether all procedures return the same results ----
For i = 1 To 100
   x = PopCount0(Rnd(i))
   If x <> PopCount1(Rnd(i)) Or
      x <> PopCount2(Rnd(i)) Or
      x <> PopCount3(Rnd(i)) Or
      x <> PopCount4(Rnd(i)) Or
      x <> PopCount5(Rnd(i)) Or
      x <> PopCount6(Rnd(i)) Or
      x <> PopCount7(Rnd(i))
      MessageRequester("Error",
                       "Different results for PopCount(" + Rnd(i) + ")")
      End
   EndIf
Next   


; ---- Speed test ----
t0 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount0(Rnd(i))
Next   
t0 = ElapsedMilliseconds() - t0

t1 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount1(Rnd(i))
Next   
t1 = ElapsedMilliseconds() - t1

t2 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount2(Rnd(i))
Next   
t2 = ElapsedMilliseconds() - t2

t3 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount3(Rnd(i))
Next   
t3 = ElapsedMilliseconds() - t3

t4 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount4(Rnd(i))
Next   
t4 = ElapsedMilliseconds() - t4

t5 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount5(Rnd(i))
Next   
t5 = ElapsedMilliseconds() - t5

t6 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount6(Rnd(i))
Next   
t6 = ElapsedMilliseconds() - t6

t7 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount7(Rnd(i))
Next   
t7 = ElapsedMilliseconds() - t7


MessageRequester("PopCount speed test",
                 "t0 = " + t0 + " ms     (" + Int(100*t0/t1) + "%)" + #LF$ +
                 "t1 = " + t1 + " ms   (100%)" + #LF$ +
                 "t2 = " + t2 + " ms   (" + Int(100*t2/t1) + "%)" + #LF$ +
                 "t3 = " + t3 + " ms   (" + Int(100*t3/t1) + "%)" + #LF$ +
                 "t4 = " + t4 + " ms  (" + Int(100*t4/t1) + "%)" + #LF$ +
                 "t5 = " + t5 + " ms  (" + Int(100*t5/t1) + "%)" + #LF$ +
                 "t6 = " + t6 + " ms   (" + Int(100*t6/t1) + "%)" + #LF$ +
                 "t7 = " + t7 + " ms     (" + Int(100*t7/t1) + "%)")
Last edited by Little John on Fri May 15, 2015 5:15 am, edited 1 time in total.
Trond
Always Here
Always Here
Posts: 7446
Joined: Mon Sep 22, 2003 6:45 pm
Location: Norway

Re: Population count

Post by Trond »

On linux x64 it says popcnt: invalid instruction.

I made two non-asm version to compare. The second is actually faster than PopCount4().

Code: Select all

Procedure PopCount5(v.l, i = 0)
  If i < 32
    ProcedureReturn (v & 1 << i) >> i + PopCount5(v, i+1)
  EndIf
EndProcedure

Procedure PopCount6(v.l, i = 0)
  For I = 0 To 31
    Agg + (v & 1 << i) >> i
  Next
  ProcedureReturn Agg
EndProcedure
User avatar
idle
Always Here
Always Here
Posts: 5018
Joined: Fri Sep 21, 2007 5:52 am
Location: New Zealand

Re: Population count

Post by idle »

swar method no branching

Code: Select all

Procedure Popcount32(x.l)
  ;x - (x >> 1) &  $55555555           
  !mov rax, [p.v_x]
  !mov rdx, rax
  !shr rdx, 1
  !and rdx, 0x55555555 
  !sub rax,rdx 
  ;x = (x & $33333333) + ((x >> 2) & $33333333)
  !mov rdx, rax       ;x 
  !and rax, 0x33333333 
  !shr rdx, 2 
  !and rdx, 0x33333333
  !add rax,rdx
  ;x = (x + (x >> 4)) & $0f0f0f0f0f
  !mov rdx, rax 
  !shr rdx, 4 
  !add rax,rdx 
  !and rax, 0x0f0f0f0f
  ;x * 0x01010101 >> 24
  !imul eax, 0x01010101
  !shr eax, 24
  ProcedureReturn  
EndProcedure 

x = Random($FFFFFFFF)
Debug Bin(x)
Debug Popcount32(x) 

Windows 11, Manjaro, Raspberry Pi OS
Image
User avatar
luis
Addict
Addict
Posts: 3876
Joined: Wed Aug 31, 2005 11:09 pm
Location: Italy

Re: Population count

Post by luis »

@idle, wow that's insane !

How it works explained (same code in C) -> http://www.playingwithpointers.com/swar.html
Last edited by luis on Fri May 15, 2015 10:55 am, edited 1 time in total.
"Have you tried turning it off and on again ?"
A little PureBasic review
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3870
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Re: Population count

Post by wilbert »

Very nice Idle.
By far the fastest method after the popcnt instruction. :)
Windows (x64)
Raspberry Pi OS (Arm64)
User avatar
idle
Always Here
Always Here
Posts: 5018
Joined: Fri Sep 21, 2007 5:52 am
Location: New Zealand

Re: Population count

Post by idle »

If you need a 64 bit popcount, it's the same but it can't be done with immediate's
(which makes me wonder if the previous one would need to be done like this for 32 bit x86)

Code: Select all

Procedure Popcount64(x.i)
  ;x - (x >> 1) &  $5555555555555555           
  !mov rax, [p.v_x]
  !mov rdx, rax
  !shr rdx, 1
  !mov r15, 0x5555555555555555
  !and rdx, r15
  !sub rax,rdx 
  ;x = (x & $3333333333333333) + ((x >> 2) & $3333333333333333)
  !mov rdx, rax       ;x 
  !mov r15, 0x3333333333333333 
  !and rax, r15  
  !shr rdx, 2 
  !and rdx, r15
  !add rax,rdx
  ;x = (x + (x >> 4)) & $0f0f0f0f0f0f0f0f
  !mov rdx, rax 
  !shr rdx, 4 
  !add rax,rdx 
  !mov r15, 0x0f0f0f0f0f0f0f0f
  !and rax, r15 
  ;x * 0101010101010101 >> 56
  !mov r15, 0x0101010101010101
  !imul rax, r15
  !shr rax, 56 
  ProcedureReturn  
EndProcedure 
Windows 11, Manjaro, Raspberry Pi OS
Image
Little John
Addict
Addict
Posts: 4519
Joined: Thu Jun 07, 2007 3:25 pm
Location: Berlin, Germany

Re: Population count

Post by Little John »

Trond and idle, many thanks for your posts!
I've included your code in the speed test above.
Trond wrote:On linux x64 it says popcnt: invalid instruction.
Same problem here (using Linux Mint 17.1).
On the same computer, the code works fine on Windows 7.
So it's actually a problem of (only some distributions of?) Linux.
Trond wrote:I made two non-asm version to compare. The second is actually faster than PopCount4().
This is a good example that smart PB code can be faster than badly written ASM code.
In my tests, even your recursive version is faster than my PopCount4(). :-)

Idle, your code looks like a magic trick to me. :-)
But it works. It's almost as fast as the 'popcnt' instruction.
Incredible!
(I took your first code and adapted it for 32 bit.)
User avatar
idle
Always Here
Always Here
Posts: 5018
Joined: Fri Sep 21, 2007 5:52 am
Location: New Zealand

Re: Population count

Post by idle »

The problem with popcnt on linux is the fasm version
I just updated it to the latest version and then it works

Code: Select all

Procedure PopCountSSE4(x.i)
  !popcnt rax,[p.v_x]   
  ProcedureReturn 
EndProcedure   

x = Random($FFFFFFFF)
Debug Bin(x)
Debug PopCountSSE4(x) 
Windows 11, Manjaro, Raspberry Pi OS
Image
said
Enthusiast
Enthusiast
Posts: 342
Joined: Thu Apr 14, 2011 6:07 pm

Re: Population count

Post by said »

For fun, i took Trond's PopCount6 and removed the loop for, with plain silly repetition block see in PopCount8() the result is interesting (72% vs 340% on my PC)

Code: Select all


EnableExplicit

Procedure.l PopCount0 (k.l)
   ! mov    eax, [p.v_k]
   ! popcnt eax, eax
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount1 (k.l)
   ; -- wilbert
   
   !xor eax, eax
   !mov edx, [p.v_k]
   !and edx, edx
   !jz popcount_m2_done
   
   !popcount_m2_loop:
   !inc eax
   !lea ecx, [edx - 1]
   !and edx, ecx
   !jnz popcount_m2_loop
   
   !popcount_m2_done:
   ProcedureReturn
EndProcedure


Procedure.l PopCount2 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount2:
   !shl edx, 1
   !adc eax, 0
   !dec ecx
   !jnz popcount2
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount3 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount3:
   !shl edx, 1
   !adc eax, 0
   !loop popcount3
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount4 (k.l)
   ; -- LJ
   
   ! mov  edx, [p.v_k]
   ! xor  eax, eax
   ! mov  ecx, 32
   
   ! popcount_again:
   ! dec  ecx
   ! bt   edx, ecx
   ! jnc  popcount_notset
   ! inc  eax
   ! popcount_notset:   
   ! or   ecx, ecx
   ! jnz  popcount_again
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount5 (v.l, i = 0)
   ; -- Trond
   
   If i < 32
      ProcedureReturn (v & 1 << i) >> i + PopCount5(v, i+1)
   EndIf
EndProcedure


Procedure.l PopCount6 (v.l)
   ; -- Trond
   
   Protected i.i, Agg.l
   
   For i = 0 To 31
      Agg + (v & 1 << i) >> i
   Next
   ProcedureReturn Agg
EndProcedure


Procedure.l Popcount7 (x.l)
   ; -- idle (adapted for 32 bit)
   
   ;x - (x >> 1) &  $55555555           
   !mov eax, [p.v_x]
   !mov edx, eax
   !shr edx, 1
   !and edx, 0x55555555
   !sub eax, edx
   ;x = (x & $33333333) + ((x >> 2) & $33333333)
   !mov edx, eax       ;x
   !and eax, 0x33333333
   !shr edx, 2
   !and edx, 0x33333333
   !add eax, edx
   ;x = (x + (x >> 4)) & $0f0f0f0f0f
   !mov edx, eax
   !shr edx, 4
   !add eax, edx
   !and eax, 0x0f0f0f0f
   ;x * 0x01010101 >> 24
   !imul eax, 0x01010101
   !shr eax, 24
   ProcedureReturn 
EndProcedure 

Procedure.l Popcount8(v.l)
    ; --- Trond without a loop
    Protected y.l
    
    y = (v & 1 <<  0) >>  0 +
        (v & 1 <<  1) >>  1 +
        (v & 1 <<  2) >>  2 +
        (v & 1 <<  3) >>  3 +
        (v & 1 <<  4) >>  4 +
        (v & 1 <<  5) >>  5 +
        (v & 1 <<  6) >>  6 +
        (v & 1 <<  7) >>  7 +
        (v & 1 <<  8) >>  8 +
        (v & 1 <<  9) >>  9 +
        (v & 1 << 10) >> 10 +
        (v & 1 << 11) >> 11 +
        (v & 1 << 12) >> 12 +
        (v & 1 << 13) >> 13 +
        (v & 1 << 14) >> 14 +
        (v & 1 << 15) >> 15 +
        (v & 1 << 16) >> 16 +
        (v & 1 << 17) >> 17 +
        (v & 1 << 18) >> 18 +
        (v & 1 << 19) >> 19 +
        (v & 1 << 20) >> 20 +
        (v & 1 << 21) >> 21 +
        (v & 1 << 22) >> 22 +
        (v & 1 << 23) >> 23 +
        (v & 1 << 24) >> 24 +
        (v & 1 << 25) >> 25 +
        (v & 1 << 26) >> 26 +
        (v & 1 << 27) >> 27 +
        (v & 1 << 28) >> 28 +
        (v & 1 << 29) >> 29 +
        (v & 1 << 30) >> 30 +
        (v & 1 << 31) >> 31 
    
    ProcedureReturn y
EndProcedure

; ---- Initialisation ----
Define.i i, x, rep=10000000
Define.i t0, t1, t2, t3, t4, t5, t6, t7, t8

Dim Rnd.l(rep)
For i = 1 To rep
   Rnd(i) = Random(2147483647)
Next


; ---- Small check whether all procedures return the same results ----
For i = 1 To 100
   x = PopCount0(Rnd(i))
   If x <> PopCount1(Rnd(i)) Or
      x <> PopCount2(Rnd(i)) Or
      x <> PopCount3(Rnd(i)) Or
      x <> PopCount4(Rnd(i)) Or
      x <> PopCount5(Rnd(i)) Or
      x <> PopCount6(Rnd(i)) Or
      x <> PopCount7(Rnd(i)) Or
      x <> PopCount8(Rnd(i)) 
      
      MessageRequester("Error",
                       "Different results for PopCount(" + Rnd(i) + ")")
      End
   EndIf
Next   


; ---- Speed test ----
t0 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount0(Rnd(i))
Next   
t0 = ElapsedMilliseconds() - t0

t1 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount1(Rnd(i))
Next   
t1 = ElapsedMilliseconds() - t1

t2 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount2(Rnd(i))
Next   
t2 = ElapsedMilliseconds() - t2

t3 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount3(Rnd(i))
Next   
t3 = ElapsedMilliseconds() - t3

t4 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount4(Rnd(i))
Next   
t4 = ElapsedMilliseconds() - t4

t5 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount5(Rnd(i))
Next   
t5 = ElapsedMilliseconds() - t5

t6 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount6(Rnd(i))
Next   
t6 = ElapsedMilliseconds() - t6

t7 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount7(Rnd(i))
Next   
t7 = ElapsedMilliseconds() - t7

t8 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount8(Rnd(i))
Next   
t8 = ElapsedMilliseconds() - t8

MessageRequester("PopCount speed test",
                 "t0 = " + t0 + " ms     (" + Int(100*t0/t1) + "%)" + #LF$ +
                 "t1 = " + t1 + " ms   (100%)" + #LF$ +
                 "t2 = " + t2 + " ms   (" + Int(100*t2/t1) + "%)" + #LF$ +
                 "t3 = " + t3 + " ms   (" + Int(100*t3/t1) + "%)" + #LF$ +
                 "t4 = " + t4 + " ms   (" + Int(100*t4/t1) + "%)" + #LF$ +
                 "t5 = " + t5 + " ms   (" + Int(100*t5/t1) + "%)" + #LF$ +
                 "t6 = " + t6 + " ms   (" + Int(100*t6/t1) + "%)" + #LF$ +
                 "t7 = " + t7 + " ms   (" + Int(100*t7/t1) + "%)" + #LF$ +
                 "t8 = " + t8 + " ms   (" + Int(100*t8/t1) + "%)")
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3870
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Re: Population count

Post by wilbert »

idle wrote:The problem with popcnt on linux is the fasm version
I just updated it to the latest version and then it works
If for some reason you wouldn't want to do that you can define the opcode with some bytes

Code: Select all

Procedure.l PopCount(x.l)
  !mov eax, [p.v_x]
  !dd 0xc0b80ff3; popcnt eax, eax
  ProcedureReturn 
EndProcedure
Strange that the Linux version of PureBasic doesn't have the same assembler version.
I thought both Linux and Windows used Fasm.
On the other hand, on OS X we have Nasm for x86 and Yasm for x64. :shock: (no idea why not the same is used for both).
Windows (x64)
Raspberry Pi OS (Arm64)
User avatar
idle
Always Here
Always Here
Posts: 5018
Joined: Fri Sep 21, 2007 5:52 am
Location: New Zealand

Re: Population count

Post by idle »

wilbert wrote: Strange that the Linux version of PureBasic doesn't have the same assembler version.
I thought both Linux and Windows used Fasm.
On the other hand, on OS X we have Nasm for x86 and Yasm for x64. :shock: (no idea why not the same is used for both).

I guess Fred forgot to update fasm for 5.22 linux x64 or maybe there was an unresolved issue with fasm around the time of 5.22 release?

The different assemblers kind of preclude using preprocessor logic in the assembler. it's a shame fasm
hasn't built a mac version yet.
Windows 11, Manjaro, Raspberry Pi OS
Image
davido
Addict
Addict
Posts: 1890
Joined: Fri Nov 09, 2012 11:04 pm
Location: Uttoxeter, UK

Re: Population count

Post by davido »

@Little John,
Thank you for bringing this subject up.
I had written a PB version by the checking each bit. It runs at 600%.
I've chosen the PB version of the method described by idle; it runs at 210%.
I present it, just in case it can be improved upon:

Code: Select all


EnableExplicit

Procedure.l PopCount0 (k.l)
   ! mov    eax, [p.v_k]
   ! popcnt eax, eax
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount1 (k.l)
   ; -- wilbert
   
   !xor eax, eax
   !mov edx, [p.v_k]
   !and edx, edx
   !jz popcount_m2_done
   
   !popcount_m2_loop:
   !inc eax
   !lea ecx, [edx - 1]
   !and edx, ecx
   !jnz popcount_m2_loop
   
   !popcount_m2_done:
   ProcedureReturn
EndProcedure


Procedure.l PopCount2 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount2:
   !shl edx, 1
   !adc eax, 0
   !dec ecx
   !jnz popcount2
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount3 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount3:
   !shl edx, 1
   !adc eax, 0
   !loop popcount3
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount4 (k.l)
   ; -- LJ
   
   ! mov  edx, [p.v_k]
   ! xor  eax, eax
   ! mov  ecx, 32
   
   ! popcount_again:
   ! dec  ecx
   ! bt   edx, ecx
   ! jnc  popcount_notset
   ! inc  eax
   ! popcount_notset:   
   ! or   ecx, ecx
   ! jnz  popcount_again
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount5 (v.l, i = 0)
   ; -- Trond
   
   If i < 32
      ProcedureReturn (v & 1 << i) >> i + PopCount5(v, i+1)
   EndIf
EndProcedure


Procedure.l PopCount6 (v.l)
   ; -- Trond
   
   Protected i.i, Agg.l
   
   For i = 0 To 31
      Agg + (v & 1 << i) >> i
   Next
   ProcedureReturn Agg
EndProcedure


Procedure.l Popcount7 (x.l)
   ; -- idle (adapted for 32 bit)
   
   ;x - (x >> 1) &  $55555555           
   !mov eax, [p.v_x]
   !mov edx, eax
   !shr edx, 1
   !and edx, 0x55555555
   !sub eax, edx
   ;x = (x & $33333333) + ((x >> 2) & $33333333)
   !mov edx, eax       ;x
   !and eax, 0x33333333
   !shr edx, 2
   !and edx, 0x33333333
   !add eax, edx
   ;x = (x + (x >> 4)) & $0f0f0f0f0f
   !mov edx, eax
   !shr edx, 4
   !add eax, edx
   !and eax, 0x0f0f0f0f
   ;x * 0x01010101 >> 24
   !imul eax, 0x01010101
   !shr eax, 24
   ProcedureReturn 
EndProcedure 

Procedure.l Popcount8(v.l)
    ; --- Trond without a loop
    Protected y.l
    
    y = (v & 1 <<  0) >>  0 +
        (v & 1 <<  1) >>  1 +
        (v & 1 <<  2) >>  2 +
        (v & 1 <<  3) >>  3 +
        (v & 1 <<  4) >>  4 +
        (v & 1 <<  5) >>  5 +
        (v & 1 <<  6) >>  6 +
        (v & 1 <<  7) >>  7 +
        (v & 1 <<  8) >>  8 +
        (v & 1 <<  9) >>  9 +
        (v & 1 << 10) >> 10 +
        (v & 1 << 11) >> 11 +
        (v & 1 << 12) >> 12 +
        (v & 1 << 13) >> 13 +
        (v & 1 << 14) >> 14 +
        (v & 1 << 15) >> 15 +
        (v & 1 << 16) >> 16 +
        (v & 1 << 17) >> 17 +
        (v & 1 << 18) >> 18 +
        (v & 1 << 19) >> 19 +
        (v & 1 << 20) >> 20 +
        (v & 1 << 21) >> 21 +
        (v & 1 << 22) >> 22 +
        (v & 1 << 23) >> 23 +
        (v & 1 << 24) >> 24 +
        (v & 1 << 25) >> 25 +
        (v & 1 << 26) >> 26 +
        (v & 1 << 27) >> 27 +
        (v & 1 << 28) >> 28 +
        (v & 1 << 29) >> 29 +
        (v & 1 << 30) >> 30 +
        (v & 1 << 31) >> 31 
    
    ProcedureReturn y
EndProcedure


Procedure popcount9(i.l)
  Protected j.i
  j = (i >> 1) & $55555555;
  i = i - j; // (A)
  i = (i & $33333333) + ((i >> 2) & $33333333); // (B)
  i = (i & $0F0F0F0F) + ((i >> 4) & $0F0F0F0F); // (C)
  i = i * $01010101; // (D)
  ProcedureReturn i >> 24;
EndProcedure

Procedure NumOfBitsSet(N.q)
  ;=======================================================
  ;Counts the number of bits set in N and returns the
  ;count as an integer.
  ;=======================================================
  Protected Count.i
  If N < 0
    ProcedureReturn -1
  Else
    While N > 0
      If N & 1 = 1
        Count + 1
      EndIf
      N >> 1
    Wend 
    ProcedureReturn Count
  EndIf
EndProcedure

; ---- Initialisation ----
Define.i i, x, rep=10000000
Define.i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta

Dim Rnd.l(rep)
For i = 1 To rep
   Rnd(i) = Random(2147483647)
Next


; ---- Small check whether all procedures return the same results ----
For i = 1 To 100
   x = PopCount0(Rnd(i))
   If x <> PopCount1(Rnd(i)) Or
      x <> PopCount2(Rnd(i)) Or
      x <> PopCount3(Rnd(i)) Or
      x <> PopCount4(Rnd(i)) Or
      x <> PopCount5(Rnd(i)) Or
      x <> PopCount6(Rnd(i)) Or
      x <> PopCount7(Rnd(i)) Or
      x <> PopCount8(Rnd(i)) Or 
      x <> PopCount9(Rnd(i)) Or
      x <> NumOfBitsSet(Rnd(i)) 
      
      MessageRequester("Error",
                       "Different results for PopCount(" + Rnd(i) + ")")
      End
   EndIf
Next   


; ---- Speed test ----
t0 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount0(Rnd(i))
Next   
t0 = ElapsedMilliseconds() - t0

t1 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount1(Rnd(i))
Next   
t1 = ElapsedMilliseconds() - t1

t2 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount2(Rnd(i))
Next   
t2 = ElapsedMilliseconds() - t2

t3 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount3(Rnd(i))
Next   
t3 = ElapsedMilliseconds() - t3

t4 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount4(Rnd(i))
Next   
t4 = ElapsedMilliseconds() - t4

t5 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount5(Rnd(i))
Next   
t5 = ElapsedMilliseconds() - t5

t6 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount6(Rnd(i))
Next   
t6 = ElapsedMilliseconds() - t6

t7 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount7(Rnd(i))
Next   
t7 = ElapsedMilliseconds() - t7

t8 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount8(Rnd(i))
Next   
t8 = ElapsedMilliseconds() - t8

t9 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount9(Rnd(i))
Next   
t9 = ElapsedMilliseconds() - t9

ta = ElapsedMilliseconds()
For i = 1 To rep
   x = NumOfBitsSet(Rnd(i))
Next   
ta = ElapsedMilliseconds() - ta

MessageRequester("PopCount speed test",
                 "t0 = " + t0 + " ms     (" + Int(100*t0/t1) + "%) (Intel)" + #LF$ +
                 "t1 = " + t1 + " ms   (100%) (wilbert)" + #LF$ +
                 "t2 = " + t2 + " ms   (" + Int(100*t2/t1) + "%) (luis)" + #LF$ +
                 "t3 = " + t3 + " ms   (" + Int(100*t3/t1) + "%) (luis2)" + #LF$ +
                 "t4 = " + t4 + " ms   (" + Int(100*t4/t1) + "%) Little John)" + #LF$ +
                 "t5 = " + t5 + " ms   (" + Int(100*t5/t1) + "%) (Trond)" + #LF$ +
                 "t6 = " + t6 + " ms   (" + Int(100*t6/t1) + "%) (Trond2)" + #LF$ +
                 "t7 = " + t7 + " ms   (" + Int(100*t7/t1) + "%) (idle)" + #LF$ +
                 "t8 = " + t8 + " ms   (" + Int(100*t8/t1) + "%) (said version of trond unrolled)" + #LF$ +
                 "t9 = " + t9 + " ms   (" + Int(100*t9/t1) + "%) (idle-PB)" + #LF$ +
                 "ta = " + ta + " ms   (" + Int(100*ta/t1) + "%) (davido)")
DE AA EB
Post Reply