I hope I didn't make any serious mistakes. The ASM code was a nightmare to debug

The algorithm itself is unaltered but I did make some changes to the implementation to benefit from some SSE2 features.
I included a 4x4 ordered dither and the code should be cross platform compatible.
Module
Code: Select all
DeclareModule NeuQuant; v1.0.3
;/* NeuQuant Neural-Net Quantization Algorithm
; * Copyright (c) 1994 Anthony Dekker
; *
; * NEUQUANT Neural-Net quantization algorithm by Anthony Dekker, 1994.
; * See "Kohonen neural networks for optimal colour quantization"
; * in "Network: Computation in Neural Systems" Vol. 5 (1994) pp 351-367.
; * for a discussion of the algorithm.
; * See also http://members.ozemail.com.au/~dekker/NEUQUANT.HTML
; *
; * Any party obtaining a copy of these files from the author, directly or
; * indirectly, is granted, free of charge, a full and unrestricted irrevocable,
; * world-wide, paid up, royalty-free, nonexclusive right and license to deal
; * in this software and documentation files (the "Software"), including without
; * limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
; * and/or sell copies of the Software, and to permit persons who receive
; * copies from any such party to do so, with the only requirement being
; * that this copyright notice remain intact.
; */
; Ported to PureBasic ASM by Wilbert (SSE2 required)
Declare InitNet()
Declare UnbiasNet(*palette)
Declare.i InxBuild(reserveTransparent = #False)
Declare.i InxSearch(rgb.l)
Declare Learn(image.i, sample = 10)
; helper procedures not part of actual NeuQuant
Declare InxBuildFromPalette(Array palette(1))
Declare.l PointOrdDith(x, y)
EndDeclareModule
Module NeuQuant
CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
; 64 bit Macros
#x64 = #True
Macro LoadPtrNQ
!mov rdx, [neuquant.p_NQ]
EndMacro
Macro LoadXMM(xmm_reg)
!movdqa xmm_reg, [rdx]
EndMacro
Macro SaveXMM(xmm_reg)
!movdqa [rdx], xmm_reg
EndMacro
Macro SaveDWord(dwrd)
!mov dword [rdx], dwrd
EndMacro
CompilerElse
; 32 bit Macros
#x64 = #False
Macro LoadPtrNQ
!mov edx, [neuquant.p_NQ]
EndMacro
Macro LoadXMM(xmm_reg)
!movdqa xmm_reg, [edx]
EndMacro
Macro SaveXMM(xmm_reg)
!movdqa [edx], xmm_reg
EndMacro
Macro SaveDWord(dwrd)
!mov dword [edx], dwrd
EndMacro
CompilerEndIf
Structure Pixel
StructureUnion
w.w[8]
l.l[4]
q.q[2]
EndStructureUnion
EndStructure
Structure NQ
network.Pixel[256] ; offset 0
freq.l[256] ; offset 4096
bias.l[256] ; offset 5120
netindex.l[256] ; offset 6144
radpower.l[32] ; offset 7168
EndStructure
Global *NQ_ = AllocateMemory(SizeOf(NQ) + 65536, #PB_Memory_NoClear)
Global *NQ.NQ = *NQ_ & -65536 + 65536
Procedure InitNet()
LoadPtrNQ
!xor ecx, ecx ; ecx = index
!neuquant.l_initnet0: ; init network values
!movd xmm0, ecx
!pslld xmm0, 4
!pshufd xmm0, xmm0, 0xc0
SaveXMM(xmm0)
!add dx, 16
!inc cl
!jnz neuquant.l_initnet0
!neuquant.l_initnet1: ; init freq values
SaveDWord(256)
!add dx, 4
!inc cl
!jnz neuquant.l_initnet1
!neuquant.l_initnet2: ; init bias values
SaveDWord(0)
!add dx, 4
!inc cl
!jnz neuquant.l_initnet2
EndProcedure
Procedure UnbiasNet(*palette)
; Modified version of UnbiasNet.
; Outputs 8 words idx-0-g-0-0-b-g-r
; Modification made to make search faster
LoadPtrNQ
!pcmpeqd xmm1, xmm1
!psrld xmm1, 31
!pslld xmm1, 3
!pcmpeqd xmm2, xmm2
!psrlw xmm2, 8
!psrlq xmm2, 16
!pshufhw xmm2, xmm2, 0xf7
!pcmpeqd xmm3, xmm3
!pslld xmm3, 24
CompilerIf #x64
!mov rax, [p.p_palette]
CompilerElse
!mov eax, [p.p_palette]
CompilerEndIf
!xor ecx, ecx ; ecx = index
!neuquant.l_unbiasnet0:
LoadXMM(xmm0)
!paddd xmm0, xmm1
!psrld xmm0, 4
!packssdw xmm0, xmm0
!pminsw xmm0, xmm2 ; limit to 255 max
!pinsrw xmm0, ecx, 7 ; insert index
SaveXMM(xmm0)
!packuswb xmm0, xmm0 ; output palette value
!por xmm0, xmm3
CompilerIf #x64
!movd [rax], xmm0
!add rax, 4
CompilerElse
!movd [eax], xmm0
!add eax, 4
CompilerEndIf
!add dx, 16
!inc cl
!jnz neuquant.l_unbiasnet0
EndProcedure
Macro AlterNeighASM
!shl dx, 2
CompilerIf #x64
!movd xmm2, [rdx + 7168] ; *NQ\radpower
CompilerElse
!movd xmm2, [edx + 7168] ; *NQ\radpower
CompilerEndIf
!pshufd xmm2, xmm2, 0xc0
!mov dx, cx
!shl dx, 4
LoadXMM(xmm0)
!movdqa xmm3, xmm0
!psubd xmm3, xmm1 ; px.rgb - rgb
!pshufd xmm5, xmm3, 0xf5
!pmuludq xmm5, xmm2
!pmuludq xmm3, xmm2
!psllq xmm5, 32
!por xmm3, xmm5 ; a*(px.rgb - rgb)
!movdqa xmm5, xmm3 ; add sign bit
!psrld xmm5, 31 ; so negative values
!paddd xmm3, xmm5 ; divide okay
!psrad xmm3, 18 ; a*(px.rgb - rgb)/alpharadbias
!psubd xmm0, xmm3 ; px.rgb - a*(px.rgb - rgb)/alpharadbias
SaveXMM(xmm0)
EndMacro
Procedure ContestAndAlter(alpha, rad, rgb.l)
!movd xmm1, [p.v_rgb] ; expand color
!pxor xmm5, xmm5
!punpcklbw xmm1, xmm5
!punpcklwd xmm1, xmm5
!pslld xmm1, 4 ; rgb << 4
; contest
LoadPtrNQ
!pcmpeqd xmm4, xmm4 ; xmm4 = bestbiasd, bestbiaspos, bestd, bestpos
!psrlq xmm4, 1 ; xmm4 = init xmm4 to (0x7fffffff, -1, 0x7fffffff, -1)
!xor ecx, ecx ; ecx = index
!neuquant.l_contest0:
!mov dx, cx
!shl dx, 4
LoadXMM(xmm0) ; load network[index] into xmm0
!psubw xmm0, xmm1 ; subtract pixel
!pxor xmm5, xmm5
!psubw xmm5, xmm0
!pmaxsw xmm0, xmm5 ; convert difference to absolute
!pshufd xmm5, xmm0, 2 ; add 3 abs diffs together
!paddw xmm0, xmm5
!psllq xmm5, 32
!paddw xmm0, xmm5
!pinsrw xmm0, ecx, 0 ; insert index
!pshufd xmm0, xmm0, 0x44 ; xmm0 = dist, index, dist, index
!shr dx, 2
!add dx, 4096
CompilerIf #x64
!mov eax, [rdx] ; eax = *NQ\freq[i]
!sar eax, 10
!sub [rdx], eax ; *NQ\freq[i] - betafreq
!add dx, 1024
!movd xmm5, [rdx]
!psrad xmm5, 12
!pshufd xmm5, xmm5, 0x15
!sal eax, 10
!add [rdx], eax ; *NQ\bias[i] + betafreq << 10
CompilerElse
!mov eax, [edx] ; eax = *NQ\freq[i]
!sar eax, 10
!sub [edx], eax ; *NQ\freq[i] - betafreq
!add dx, 1024
!movd xmm5, [edx]
!psrad xmm5, 12
!pshufd xmm5, xmm5, 0x15
!sal eax, 10
!add [edx], eax ; *NQ\bias[i] + betafreq << 10
CompilerEndIf
!psubd xmm0, xmm5 ; xmm0 = biasdist, index, dist, index
!movdqa xmm5, xmm4 ; update best values
!pcmpgtd xmm5, xmm0
!pxor xmm0, xmm4
!pshufd xmm5, xmm5, 0xf5
!pand xmm0, xmm5
!pxor xmm4, xmm0
!inc cl
!jnz neuquant.l_contest0
!pextrw eax, xmm4, 0
!mov dx, ax
!shl dx, 2
CompilerIf #x64
!add dword [rdx + 4096], 0x40
!sub dword [rdx + 5120], 0x10000
CompilerElse
!add dword [edx + 4096], 0x40
!sub dword [edx + 5120], 0x10000
CompilerEndIf
; alter single
!movd xmm2, [p.v_alpha]
!pshufd xmm2, xmm2, 0
!pextrw eax, xmm4, 4 ; bestbiaspos
!mov dx, ax
!shl dx, 4
LoadXMM(xmm0)
!movdqa xmm3, xmm0
!psubw xmm3, xmm1 ; px.rgb - rgb
!movdqa xmm5, xmm3
!pmullw xmm3, xmm2
!pmulhw xmm5, xmm2
!pslld xmm5, 16
!por xmm3, xmm5 ; alpha*(px.rgb - rgb)
!movdqa xmm5, xmm3 ; add sign bit
!psrld xmm5, 31 ; so negative values
!paddd xmm3, xmm5 ; divide okay
!psrad xmm3, 10 ; alpha*(px.rgb - rgb)/initalpha
!psubd xmm0, xmm3 ; px.rgb - alpha*(px.rgb - rgb)/initalpha
SaveXMM(xmm0)
; alter neighbours
!mov ecx, [p.v_rad]
!and ecx, ecx
!jz neuquant.l_alter4
CompilerIf #x64
!mov r8, rbx
CompilerElse
!movd xmm4, ebx
CompilerEndIf
!mov ebx, eax
!mov ecx, eax
!add eax, [p.v_rad]
!cmp eax, 256
!jle neuquant.l_alter1
!mov eax, 256
!jmp neuquant.l_alter1
!neuquant.l_alter0:
!mov dx, cx
!sub dx, bx
AlterNeighASM
!neuquant.l_alter1:
!inc ecx
!cmp ecx, eax
!jl neuquant.l_alter0
!mov eax, ebx
!mov ecx, eax
!sub eax, [p.v_rad]
!cmp eax, -1
!jge neuquant.l_alter3
!mov eax, -1
!jmp neuquant.l_alter3
!neuquant.l_alter2:
!mov dx, bx
!sub dx, cx
AlterNeighASM
!neuquant.l_alter3:
!dec ecx
!cmp ecx, eax
!jg neuquant.l_alter2
CompilerIf #x64
!mov rbx, r8
CompilerElse
!movd ebx, xmm4
CompilerEndIf
!neuquant.l_alter4:
EndProcedure
; Search for RGB value (after net is unbiased) and return colour index
Procedure.i InxSearch(rgb.l)
!movd xmm1, [p.v_rgb]
!pcmpeqd xmm4, xmm4
!punpcklbw xmm1, xmm1
!psrlw xmm4, 8
!psrlq xmm4, 16
!pshufd xmm1, xmm1, 0x44
!pshufhw xmm4, xmm4, 0xf7
!pand xmm1, xmm4
LoadPtrNQ
!movzx ecx, byte [p.v_rgb + 1]
CompilerIf #x64
!push rbx
!mov cl, [rdx + rcx * 4 + 6144]
CompilerElse
!push ebx
!mov cl, [edx + ecx * 4 + 6144]
CompilerEndIf
!sub cl, 1
!adc cl, 1
!mov ch, cl
!dec ch
!mov eax, 0x300 ; eax = bestd
!movd xmm2, eax
!pshufd xmm2, xmm2, 0x44
!neuquant.l_search0:
!cmp cl, 0
!jz neuquant.l_search2
!movzx dx, cl
!shl dx, 4
LoadXMM(xmm0)
!pand xmm0, xmm4
!psadbw xmm0, xmm1 ; dist
!movdqa xmm5, xmm2
!pcmpgtw xmm5, xmm0 ; bestd > dist ?
!pmovmskb ebx, xmm5
!shr ebx, 1
!jnc neuquant.l_search1
CompilerIf #x64
!movzx eax, word [rdx + 14]
CompilerElse
!movzx eax, word [edx + 14]
CompilerEndIf
!pshufd xmm2, xmm0, 0x44
!neuquant.l_search1:
!inc cl
!sar bl, 7
!and cl, bl
!neuquant.l_search2:
!cmp ch, 0xff
!jz neuquant.l_search4
!movzx dx, ch
!shl dx, 4
LoadXMM(xmm0)
!pand xmm0, xmm4
!psadbw xmm0, xmm1 ; dist
!movdqa xmm5, xmm2
!pcmpgtw xmm5, xmm0 ; bestd > dist ?
!pmovmskb ebx, xmm5
!shr ebx, 1
!jnc neuquant.l_search3
CompilerIf #x64
!movzx eax, word [rdx + 14]
CompilerElse
!movzx eax, word [edx + 14]
CompilerEndIf
!pshufd xmm2, xmm0, 0x44
!neuquant.l_search3:
!sar bl, 7
!and ch, bl
!dec ch
!neuquant.l_search4:
!cmp cx, 0xff00
!jnz neuquant.l_search0
CompilerIf #x64
!pop rbx
CompilerElse
!pop ebx
CompilerEndIf
ProcedureReturn
EndProcedure
; Insertion sort of network and building of *NQ\netindex[0..255] (to do after unbias)
Procedure.i InxBuild(reserveTransparent = #False)
Protected.i i, j, smallpos, smallval
Protected.i previouscol, startpos, transparent
previouscol = 0
startpos = 0
For i = 0 To 255
smallpos = i
smallval = *NQ\network[i]\w[1]; index on g
; find smallest in i..255
For j = i + 1 To 255
If *NQ\network[j]\w[1] < smallval
smallpos = j
smallval = *NQ\network[j]\w[1]
EndIf
Next
; swap if smallpos <> i
If i <> smallpos
Swap *NQ\network[i]\q[0], *NQ\network[smallpos]\q[0]
Swap *NQ\network[i]\q[1], *NQ\network[smallpos]\q[1]
EndIf
; smallval entry is now in position i
If smallval <> previouscol
*NQ\netindex[previouscol] = (startpos + i) >> 1
j = previouscol + 1 : While j < smallval
*NQ\netindex[j] = i
j + 1 : Wend
previouscol = smallval
startpos = i
EndIf
Next
*NQ\netindex[previouscol] = (startpos + 255) >> 1
For j = previouscol + 1 To 255 : *NQ\netindex[j] = 255 : Next
If reserveTransparent
transparent = *NQ\network[1]\w[7]
*NQ\network[1]\w[7] = *NQ\network[0]\w[7]
EndIf
ProcedureReturn transparent
EndProcedure
; Main Learning Loop
Procedure Learn(image.i, sample = 10)
Protected.i i, j, x, y, width, height, step_x, step_y
Protected.i radius, rad, alpha, delta, totalpixels, samplepixels, alphadec
alphadec = 30 + (sample - 1) / 3
If IsImage(Image)
width = ImageWidth(Image)
height = ImageHeight(Image)
totalpixels = width * height
samplepixels = totalpixels / sample
delta = samplepixels / 100
alpha = 1024
radius = 2048
rad = radius >> 6
If rad <= 1 : rad = 0 : EndIf
i = 0 : While i < rad
*NQ\radpower[i] = alpha * (rad*rad - i*i) << 8 / (rad*rad)
i + 1 : Wend
If totalpixels % 499
If totalpixels > 499
step_y = 499 / width
step_x = 499 - step_y * width
Else
samplepixels = totalpixels
delta = samplepixels / 100
If totalpixels > 7
step_y = 7 / width
step_x = 7 - step_y * width
Else
step_x = 1
EndIf
EndIf
ElseIf totalpixels % 491
step_y = 491 / width
step_x = 491 - step_y * width
ElseIf totalpixels % 487
step_y = 487 / width
step_x = 487 - step_y * width
Else
step_y = 503 / width
step_x = 503 - step_y * width
EndIf
i = 0
StartDrawing(ImageOutput(Image))
While samplepixels
ContestAndAlter(alpha, rad, Point(x, y))
x + step_x
If x >= width : x - width : y + 1 : EndIf
y + step_y
If y >= height : y - height : EndIf
!inc dword [p.v_i]
If i = delta
alpha - alpha / alphadec
radius - radius / 15
rad = radius >> 6
If rad <= 1 : rad = 0 : EndIf
j = 0 : While j < rad
*NQ\radpower[j] = alpha * (rad*rad - j*j) << 8 / (rad*rad)
j + 1 : Wend
i = 0
EndIf
!dec dword [p.v_samplepixels]
Wend
StopDrawing()
EndIf
EndProcedure
Procedure InxBuildFromPalette(Array palette(1))
Protected.i i, c, s = ArraySize(palette())
FillMemory(*NQ, 4096)
While i <= s
c = palette(i)
*NQ\network[i]\w[0] = c & $ff
*NQ\network[i]\w[1] = c >> 8 & $ff
*NQ\network[i]\w[2] = c >> 16 & $ff
*NQ\network[i]\w[5] = *NQ\network[i]\w[1]
*NQ\network[i]\w[7] = i
i + 1
Wend
While i < 256
*NQ\network[i]\q[0] = $000000ff00ff00ff
*NQ\network[i]\q[1] = $00ff000000ff0000
i + 1
Wend
InxBuild()
i = s + 1
c = *NQ\network[s]\w[7]
While i < 256
*NQ\network[i]\w[7] = c
i + 1
Wend
EndProcedure
Procedure.l PointOrdDith(x, y)
Protected.l c = Point(x, y)
!movd xmm0, [p.v_c]
!mov al, [p.v_x]
!mov ah, [p.v_y]
!shl al, 6
!and eax, 0x3c0
!shr ax, 3
!punpcklbw xmm0, xmm0
CompilerIf #x64
!lea rdx, [neuquant.l_pointorddith]
!movq xmm1, [rdx + rax]
CompilerElse
!movq xmm1, [neuquant.l_pointorddith + eax]
CompilerEndIf
!psrlw xmm0, 8
!paddw xmm0, xmm1
!packuswb xmm0, xmm0
!movd eax, xmm0
ProcedureReturn
!neuquant.l_pointorddith:
!dq 0xfff9fff9fff9, 0x000100010001, 0xfffbfffbfffb, 0x000300030003
!dq 0x000500050005, 0xfffdfffdfffd, 0x000700070007, 0xffffffffffff
!dq 0xfffcfffcfffc, 0x000400040004, 0xfffafffafffa, 0x000200020002
!dq 0x000800080008, 0x000000000000, 0x000600060006, 0xfffefffefffe
EndProcedure
EndModule
Code: Select all
UseJPEGImageDecoder()
Dim Palette.l(255)
Define.i t0s, t0e, cnt, t1s, t1e, x, y, max_x, max_y, i
If OpenWindow(0, 0, 0, 600, 600, "", #PB_Window_SystemMenu | #PB_Window_ScreenCentered)
If LoadImage(0, "test.jpg")
t1s = ElapsedMilliseconds()
NeuQuant::InitNet()
NeuQuant::Learn(0)
NeuQuant::UnbiasNet(@Palette())
NeuQuant::InxBuild()
StartDrawing(ImageOutput(0))
max_x = ImageWidth(0) - 1
max_y = ImageHeight(0) - 1
For y = 0 To max_y
For x = 0 To max_x
i = NeuQuant::InxSearch(NeuQuant::PointOrdDith(x, y)); 4x4 ordered dither
;i = NeuQuant::InxSearch(Point(x, y)); no dither
Plot(x, y, Palette(i))
Next
Next
StopDrawing()
t1e = ElapsedMilliseconds()
SetWindowTitle(0, Str(t1e-t1s))
ImageGadget(0, 0, 0, 600, 600, ImageID(0))
EndIf
Repeat : Until WaitWindowEvent() = #PB_Event_CloseWindow
EndIf