Multi-threaded Separable Box Blur

Share your advanced PureBasic knowledge/code with the community.
punak
Enthusiast
Enthusiast
Posts: 113
Joined: Tue Sep 07, 2021 12:08 pm

Multi-threaded Separable Box Blur

Post by punak »

Hi all,
I needed a fast algorithm for image blurring for a task. Artificial intelligence wrote this code for me and of course it worked properly with some debugging. experiment with testing the speed of blurring your images.
I haven't tried it on x64 yet.

Code: Select all

;==============================================================
; Multi-threaded Separable Box Blur 
; - Linear pointers (AllocateMemory)
; - No RGBA() (bit-shift write)
; - Separable (horizontal then vertical)
; - Multithreaded: horizontal pass parallel by rows, vertical pass parallel by columns
; - Written by AI in collaboration With Punak
;==============================================================

EnableExplicit

; === Editable ===
#THREADS = 4              

Structure _SIZEF
  Width.i
  Height.i
EndStructure

; --- Globals for threads (populated before creating threads) ---
Global g_width.i, g_height.i, g_radius.i, g_threads.i
Global *g_buffer, g_temp, g_pitch.i

; ----------------------------
; Thread: Horizontal pass for a chunk of rows
; parameter = threadIndex (0..g_threads-1)
; ----------------------------
Procedure.l HorizontalPassThread(threadIndex.l)
  Protected startRow = (threadIndex * g_height) / g_threads
  Protected endRow = (((threadIndex + 1) * g_height) / g_threads) - 1
  If endRow < startRow : endRow = startRow : EndIf
  
  Protected x, y, i, left, right, count
  Protected c.l
  Protected sumR.i, sumG.i, sumB.i, sumA.i
  Protected srcOffset, tmpOffset
  
  For y = startRow To endRow
    srcOffset = y * g_pitch
    ; initial window sum for x = 0
    sumR = 0 : sumG = 0 : sumB = 0 : sumA = 0
    For i = -g_radius To g_radius
      x = i
      If x < 0 : x = 0 : EndIf
      If x >= g_width : x = g_width - 1 : EndIf
      c = PeekL(*g_buffer + srcOffset + x * 4)
      sumR + (c & $FF)
      sumG + ((c >> 8) & $FF)
      sumB + ((c >> 16) & $FF)
      sumA + ((c >> 24) & $FF)
    Next
    
    For x = 0 To g_width - 1
      count = g_radius * 2 + 1
      tmpOffset = (y * g_width + x) * 4
      PokeL(g_temp + tmpOffset, ((sumA / count) << 24) | ((sumB / count) << 16) | ((sumG / count) << 8) | (sumR / count))
      
      ; slide window: remove left, add right
      left = x - g_radius
      right = x + g_radius + 1
      
      If left < 0 : left = 0 : EndIf
      If right >= g_width : right = g_width - 1 : EndIf
      
      c = PeekL(*g_buffer + srcOffset + left * 4)
      sumR - (c & $FF)
      sumG - ((c >> 8) & $FF)
      sumB - ((c >> 16) & $FF)
      sumA - ((c >> 24) & $FF)
      
      c = PeekL(*g_buffer + srcOffset + right * 4)
      sumR + (c & $FF)
      sumG + ((c >> 8) & $FF)
      sumB + ((c >> 16) & $FF)
      sumA + ((c >> 24) & $FF)
    Next
  Next
  
  ProcedureReturn 0
EndProcedure

; ----------------------------
; Thread: Vertical pass for a chunk of columns
; parameter = threadIndex (0..g_threads-1)
; ----------------------------
Procedure.l VerticalPassThread(threadIndex.l)
  Protected startCol = (threadIndex * g_width) / g_threads
  Protected endCol = (((threadIndex + 1) * g_width) / g_threads) - 1
  If endCol < startCol : endCol = startCol : EndIf
  
  Protected x, y, i, top, bottom, count
  Protected c.l
  Protected sumR.i, sumG.i, sumB.i, sumA.i
  Protected dstOffset, tmpOffset
  
  For x = startCol To endCol
    ; initial window for y = 0
    sumR = 0 : sumG = 0 : sumB = 0 : sumA = 0
    For i = -g_radius To g_radius
      y = i
      If y < 0 : y = 0 : EndIf
      If y >= g_height : y = g_height - 1 : EndIf
      tmpOffset = (y * g_width + x) * 4
      c = PeekL(g_temp + tmpOffset)
      sumR + (c & $FF)
      sumG + ((c >> 8) & $FF)
      sumB + ((c >> 16) & $FF)
      sumA + ((c >> 24) & $FF)
    Next
    
    For y = 0 To g_height - 1
      count = g_radius * 2 + 1
      dstOffset = y * g_pitch + x * 4
      PokeL(*g_buffer + dstOffset, ((sumA / count) << 24) | ((sumB / count) << 16) | ((sumG / count) << 8) | (sumR / count))
      
      top = y - g_radius
      bottom = y + g_radius + 1
      If top < 0 : top = 0 : EndIf
      If bottom >= g_height : bottom = g_height - 1 : EndIf
      
      tmpOffset = (top * g_width + x) * 4
      c = PeekL(g_temp + tmpOffset)
      sumR - (c & $FF)
      sumG - ((c >> 8) & $FF)
      sumB - ((c >> 16) & $FF)
      sumA - ((c >> 24) & $FF)
      
      tmpOffset = (bottom * g_width + x) * 4
      c = PeekL(g_temp + tmpOffset)
      sumR + (c & $FF)
      sumG + ((c >> 8) & $FF)
      sumB + ((c >> 16) & $FF)
      sumA + ((c >> 24) & $FF)
    Next
  Next
  
  ProcedureReturn 0
EndProcedure

; ----------------------------
; Main multi-threaded separable blur
; hImg : image handle (ImageOutput)
; *ImgSize : pointer to _SIZEF (width,height)
; radius : blur radius (integer)
; ----------------------------
Procedure IntegralSeparableBlur_MT(hImg, *ImgSize._SIZEF, radius)
  Protected width = *ImgSize\Width
  Protected height = *ImgSize\Height
  Protected threads = #THREADS
  
  ; set globals
  g_width = width
  g_height = height
  g_radius = radius
  g_threads = threads
  
  ; prepare drawing buffer once (main thread)
  StartDrawing(ImageOutput(hImg))
  *g_buffer = DrawingBuffer()
  g_pitch = DrawingBufferPitch()
  
  ; allocate temp buffer (linear width*height*4)
  g_temp = AllocateMemory(width * height * 4)
  If g_temp = 0
    StopDrawing()
    MessageRequester("Error", "AllocateMemory failed")
    ProcedureReturn
  EndIf
  
  ; --- Horizontal pass: create threads ---
  
  Dim threadIDs.l(threads - 1)
  Protected i
  
  For i = 0 To threads - 1
    threadIDs(i) = CreateThread(@HorizontalPassThread(), i)
  Next
  
  ; wait for all horizontal threads
  For i = 0 To threads - 1
    If threadIDs(i) <> 0
      WaitThread(threadIDs(i))
    EndIf
  Next
  
  ; --- Vertical pass: create threads ---
  For i = 0 To threads - 1
    threadIDs(i) = CreateThread(@VerticalPassThread(), i)
  Next
  
  ; wait for all vertical threads
  For i = 0 To threads - 1
    If threadIDs(i) <> 0
      WaitThread(threadIDs(i))
    EndIf
  Next
  
  FreeMemory(g_temp)
  StopDrawing()
EndProcedure

UsePNGImageDecoder()

Define hImg = LoadImage(#PB_Any, "LargImage.PNG")
If hImg = 0
  MessageRequester("Error", "Load file fail.")
  End
EndIf

Define ImgSize._SIZEF
ImgSize\Width = ImageWidth(hImg)
ImgSize\Height = ImageHeight(hImg)

OpenWindow(0, 0, 0, 800, 600, "Separable Blur MT", #PB_Window_SystemMenu | #PB_Window_ScreenCentered)

IntegralSeparableBlur_MT(hImg, @ImgSize, 200)
ImageGadget(1, 0, 0, ImgSize\Width, ImgSize\Height, ImageID(hImg))

Repeat
  If WaitWindowEvent() = #PB_Event_CloseWindow
    End
  EndIf
ForEver
SMaag
Enthusiast
Enthusiast
Posts: 329
Joined: Sat Jan 14, 2023 6:55 pm
Location: Bavaria/Germany

Re: Multi-threaded Separable Box Blur

Post by SMaag »

Blur is a Matrix operation. The maximum speed you will get with XMM Code.
On modern x64 CPU's this will be extremly fast because of Auto parallelisation, if you use multiple registers.
I guess a 4k Picture needs only ms to process in a single task. Wilbert is the specaialist for such things.
There is anywhere in the forum a demo from wilbert for a color adjust. That demonstrates the amazing speed.
SMaag
Enthusiast
Enthusiast
Posts: 329
Joined: Sat Jan 14, 2023 6:55 pm
Location: Bavaria/Germany

Re: Multi-threaded Separable Box Blur

Post by SMaag »

blure code from wilbert

viewtopic.php?p=543311&hilit=color+adju ... rt#p543311

and here the link to the HueShift discussion with the amazing speed
viewtopic.php?p=607158#p607158
Post Reply