For such short code, multi-threading makes no sense. But take a look at this (an old test of mine, not perfect):
Code: Select all
;- Mandelbrot ("Apfelmännchen") test using 256-bit AVX, Intel CPU
;- "Helle" Klaus Helbing, 30.03.2013, PB v.5.10 (x64)
;- All globals below are also referenced from the inline assembly under the name v_<Name>.
Global Freq.q ;for timing: performance-counter frequency (filled by QueryPerformanceFrequency_ in Frac)
Global Start.q ;performance-counter value taken before the worker threads are created
Global Ende.q ;performance-counter value taken after all worker threads have finished
Global Zeit.d ;duration of the last frame's iteration phase in seconds
Global ZeitG.d ;accumulated Zeit over all frames (shown as "Total Time")
Global Zoom.d = 1.0 ;current zoom factor; the main loop multiplies/divides it by 1.04 per frame
Global AchseX.d = 1.25 ;despite the name: start value of cY (the Thread macro adds row * Y1 to it)
Global AchseY.d = -2.0 ;despite the name: start value of cX for each row (see the Thread macro)
Global ScreenX.q = 1024 ;horizontal resolution used for XScale
Global ScreenY.q = 1023 ;1023 because of Y coordinate = 0; chosen for the look of the base image
Global Breite.d = 2.5 ;width of the viewed area at zoom 1
Global Hoehe.d = 2.5 ;height of the viewed area at zoom 1
Global XScale.d = Breite / ScreenX ;cX step per pixel at zoom 1
Global YScale.d = -Hoehe / ScreenY ;cY step per pixel at zoom 1 (negative)
Global X1.d = XScale / Zoom ;cX step per pixel at the current zoom (recomputed in Frac)
Global Y1.d = YScale / Zoom ;cY step per row at the current zoom (recomputed in Frac)
Global MX.d ;not referenced in the code shown here
Global MY.d ;not referenced in the code shown here
Global VersatzX.d = 0.5 ;offset for centering (only mentioned in comments in the code shown here)
Global VersatzY.d = 0.0 ;offset for centering (only mentioned in comments in the code shown here)
Global Vier.d = 4.0 ;escape threshold for zx*zx + zy*zy
Global MaxIter.q = 255 ;maximum iteration depth; NOTE: the Thread macro compares against a hard-coded 255
Global Zoom_Max.q = 500000000000 ;the zoom-in phase of the main loop stops at this factor
Global IterCount.q ;sum of all iteration depths of a frame (display only)
Global IterColor1.q ;scratch qword of thread 1: four 16-bit iteration depths (r12 points here)
Global IterColor2.q ;same for thread 2
Global IterColor3.q ;same for thread 3
Global IterColor4.q ;same for thread 4
Global XStart.d = -1.4563337 ;added to the cX start value in Frac (point that is zoomed into)
Global YStart.d = 0.0 ;added to the cY start value in Frac
Global RohBuffer.q ;pointer to the raw buffer: one 16-bit iteration depth per pixel
Global ZeichenBuffer.q ;pointer returned by DrawingBuffer() for the current frame
Global ColorBuffer.q ;pointer to the palette: one 32-bit colour per iteration depth
Global X.d ;temporary used by the Thread macro for the start value of cY
Global XH1.d = 0.0 ;first row of thread 1 as a double
Global XH2.d = 256.0 ;first row of thread 2 as a double
Global XH3.d = 512.0 ;first row of thread 3 as a double
Global XH4.d = 768.0 ;first row of thread 4 as a double
Global Z1.d ;million iterations per second (display only)
Global Z2.d ;frames per second (display only)
Global D3.d = 3.0 ;phase offset of the red palette sine (see Colors)
Global D17.d = 17.0 ;frequency factor of the green palette sine
Global D21.d = 21.0 ;frequency factor of the red palette sine
Global D53.d = 53.0 ;frequency factor of the blue palette sine
Global D127.d = 127.0 ;amplitude of the palette sines
Global D128.d = 128.0 ;offset of the palette sines
#ScreenHeight = 1024.0 ;passed to OpenWindowedScreen()
#ScreenWidth = 1024.0 ;passed to OpenWindowedScreen()
!jmp OverASM ;skip the routine and its data tables below; execution continues at the OverASM label
;-----------------------------------------------------------------------
; SinAVX - sine of a double, evaluated as a Taylor polynomial up to x^25
; In:    xmm0 (bits 0-63) = angle x in radians
; Out:   xmm0 (bits 0-63) = approximation of Sin(x)
; Uses:  rax, xmm1-xmm5 / ymm1-ymm5, flags; the upper YMM halves are zeroed on exit
; Note:  x / (Pi/2) is converted with VCVTTSD2SI, so it has to fit into a signed 64-bit integer
;-----------------------------------------------------------------------
!SinAVX:
;this part reduces the argument with respect to multiples of Pi/2 and sets the sign
!vmovsd xmm2,qword[Pi_Half] ;load XMM2 with Pi_Half (1.570796326794896619)
!vdivsd xmm0,xmm0,xmm2 ;divide XMM0 (x in radians) by XMM2 (Pi_Half), result in XMM0
!vcvttsd2si rax,xmm0 ;convert the quotient to an integer with truncation (like Int(x) in PB): RAX = number of whole quarter turns
!vcvtsi2sd xmm1,xmm1,rax ;convert the integer in RAX back to a double in XMM1
!vsubsd xmm0,xmm0,xmm1 ;XMM0 = fractional part of the quotient
!vmulsd xmm0,xmm0,xmm2 ;multiply by Pi_Half again: XMM0 = remainder of x within one quarter turn
!test rax,1 ;odd number of quarter turns (quadrant 2 and 4, 6, 8...)?
!jz @f ;no
!vaddsd xmm0,xmm0,xmm2 ;yes: add Pi_Half (XMM2) to XMM0
!@@:
!test rax,2 ;bit 1 set (quadrant 3 and 4, and negative quarter-turn counts with bit 1 set)?
!jz @f ;no
!vxorpd xmm0,xmm0,[Minus] ;yes: toggle bit 63 (sign); only bits 0-63 of the result are used afterwards
!@@:
;calculate the first 4 terms (without the start value x); lanes: 1=bits 0-63, 2=64-127, 3=128-191, 4=192-255
!vmovddup xmm0,xmm0 ;XMM0: 1=x^1 2=x^1 duplicate bits 0-63 into bits 64-127
!vinsertf128 ymm1,ymm0,xmm0,1b ;YMM1: 1=x^1 2=x^1 3=x^1 4=x^1 YMM1(0-127)=XMM0, YMM1(128-255)=XMM0
!vmulpd ymm2,ymm1,ymm1 ;YMM2: 1=x^2 2=x^2 3=x^2 4=x^2
!vmulsd xmm3,xmm2,xmm2 ;YMM3: 1=x^4 2=x^2 3=0 4=0 (the VEX scalar operation clears bits 128-255)
!vmulpd ymm4,ymm2,ymm2 ;YMM4: 1=x^4 2=x^4 3=x^4 4=x^4
!vmulpd ymm2,ymm4,ymm4 ;YMM2: 1=x^8 2=x^8 3=x^8 4=x^8 for the next 4 terms
!vmulpd ymm3,ymm3,ymm1 ;YMM3: 1=x^5 2=x^3 3=0 4=0
!vmulpd ymm1,ymm3,ymm4 ;YMM1: 1=x^9 2=x^7 3=0 4=0
!vperm2f128 ymm5,ymm1,ymm3,100000b ;YMM5: 1=x^9 2=x^7 3=x^5 4=x^3 YMM5(0-127)=YMM1(0-127), YMM5(128-255)=YMM3(0-127)
!vmulpd ymm1,ymm5,yword[RezFak] ;multiply the 4 powers in YMM5 with the coefficients 1/9! ... -1/3!, result in YMM1
;next 4 terms, without a loop
!vmulpd ymm5,ymm5,ymm2 ;YMM5: 1=x^17 2=x^15 3=x^13 4=x^11
!vmulpd ymm3,ymm5,yword[RezFak+32] ;multiply with the coefficients 1/17! ... -1/11!
!vaddpd ymm1,ymm3,ymm1 ;accumulate per lane
;next 4 terms
!vmulpd ymm5,ymm5,ymm2 ;YMM5: 1=x^25 2=x^23 3=x^21 4=x^19
!vmulpd ymm3,ymm5,yword[RezFak+64] ;multiply with the coefficients 1/25! ... -1/19!
!vaddpd ymm1,ymm3,ymm1 ;accumulate per lane
!vhaddpd ymm2,ymm1,ymm1 ;YMM2: 1=1+2 of YMM1, 3=3+4 of YMM1
!vextractf128 xmm1,ymm2,1b ;XMM1: 3+4 of YMM1 (upper half of YMM2)
!vaddsd xmm3,xmm2,xmm1 ;XMM3: 1=sum of all 12 terms
!vaddsd xmm0,xmm0,xmm3 ;XMM0: 1=sum of the terms plus the start value (first term = x)
!vzeroupper ;set YMM0H-YMM15H to zero
!ret
;data for SinAVX (placed in the code stream; skipped by the jmp above and never executed)
!Minus:
!dq 8000000000000000h ;mask for toggling bit 63 (sign)
!Pi_Half:
!dq 1.570796326794896619
!RezFak: ;reciprocal factorials with alternating sign, 4 per group, ordered to match the lanes 1-4 of YMM5
!dq 2.755731922398589065e-6 ; 1/9! 4th term
!dq -1.984126984126984127e-4 ;-1/7! 3rd term
!dq 8.333333333333333333e-3 ; 1/5! 2nd term
!dq -1.666666666666666667e-1 ;-1/3! 1st term
!dq 2.811457254345520763e-15 ; 1/17! 8th term
!dq -7.647163731819816476e-13 ;-1/15! 7th term
!dq 1.605904383682161460e-10 ; 1/13! 6th term
!dq -2.505210838544171878e-8 ;-1/11! 5th term
!dq 6.446950284384473396e-26 ; 1/25! 12th term
!dq -3.868170170630684038e-23 ;-1/23! 11th term
!dq 1.957294106339126123e-20 ; 1/21! 10th term
!dq -8.220635246624329717e-18 ;-1/19! 9th term
Procedure Colors()
  ; Builds the colour palette in ColorBuffer: one 32-bit value (bits 0-7 = B, 8-15 = G,
  ; 16-23 = R) per iteration depth. For depth i, with t = i / MaxIter:
  ;   R = Int(Sin(t * 21 + 3) * 127 + 128)
  ;   G = Int(Sin(t * 17)     * 127 + 128)
  ;   B = Int(Sin(t * 53)     * 127 + 128)
  ;
  ; The buffer holds MaxIter + 1 entries. Entries 0 .. MaxIter - 1 are computed below.
  ; Entry MaxIter stays at the zero AllocateMemory() fills it with, so a pixel whose stored
  ; depth equals MaxIter (it never escaped) indexes valid memory and is drawn black.
  ;
  ; Only registers that are volatile in the Win64 ABI and not used by SinAVX hold values
  ; across the calls (rcx, rdx, r8, r9); SinAVX itself uses rax and xmm0-xmm5. t is
  ; therefore recomputed from r8 before each channel instead of being kept in a
  ; nonvolatile XMM register.
  ColorBuffer = AllocateMemory(4 * (MaxIter + 1))
  If ColorBuffer = 0
    ProcedureReturn ;allocation failed: nothing to fill
  EndIf
  !MOV rdx,[v_ColorBuffer] ;rdx = write pointer into the palette
  !MOV rcx,[v_MaxIter] ;rcx = number of entries still to compute
  !XOR r8,r8 ;r8 = current iteration depth i
  !@@:
  ;- R
  !CVTSI2SD xmm0,r8 ;xmm0 = i
  !CVTSI2SD xmm1,qword[v_MaxIter] ;xmm1 = MaxIter
  !DIVSD xmm0,xmm1 ;xmm0 = t
  !MULSD xmm0,[v_D21]
  !ADDSD xmm0,[v_D3]
  !call SinAVX
  !MULSD xmm0,[v_D127]
  !ADDSD xmm0,[v_D128]
  !CVTTSD2SI r9,xmm0 ;truncate to an integer (like Int(x) in PB)
  !SHL r9,8 ;make room for G
  ;- G
  !CVTSI2SD xmm0,r8
  !CVTSI2SD xmm1,qword[v_MaxIter]
  !DIVSD xmm0,xmm1 ;xmm0 = t
  !MULSD xmm0,[v_D17]
  !call SinAVX
  !MULSD xmm0,[v_D127]
  !ADDSD xmm0,[v_D128]
  !CVTTSD2SI rax,xmm0
  !MOV r9b,al
  !SHL r9,8 ;make room for B
  ;- B
  !CVTSI2SD xmm0,r8
  !CVTSI2SD xmm1,qword[v_MaxIter]
  !DIVSD xmm0,xmm1 ;xmm0 = t
  !MULSD xmm0,[v_D53]
  !call SinAVX
  !MULSD xmm0,[v_D127]
  !ADDSD xmm0,[v_D128]
  !CVTTSD2SI rax,xmm0
  !MOV r9b,al
  ;- store the 32-bit colour of this depth
  !MOV [rdx],r9d ;0-7:B, 8-15:G, 16-23:R
  !ADD rdx,4
  !ADD r8,1
  !SUB rcx,1
  !JNZ @b
EndProcedure
Macro Thread(Buffer_Part, Start_Y, End_Y)
  ; Computes the iteration depths of the rows Start_Y .. End_Y - 1 (1024 pixels each, four
  ; pixels per step in one YMM register) and writes them as 16-bit words to
  ; RohBuffer + Buffer_Part.
  ;
  ; Expected on entry (set up by the procedure that expands the macro):
  ;   xmm0 = first row as a double (same value as Start_Y)
  ;   r12  = address of a qword used only by this thread (four 16-bit depths)
  ; Register roles:
  ;   rax = iteration counter          rcx = write pointer into RohBuffer
  ;   edx = lanes that escaped now     r8  = row counter, r9 = column counter
  ;   r10 = lanes still iterating      r11 = four packed 16-bit depths
  ;   ymm1 = cX, ymm2 = cY, ymm3 = zy, ymm4 = zx, ymm5 = zx*zx, ymm6 = zy*zy
  ;   ymm10 = Y step, ymm11 = X step, ymm14 = 4.0
  ; Lane 3 (bits 192-255) is the leftmost of the four pixels, lane 0 the rightmost.
  ; Clobbers rax, rcx, rdx, r8-r11, ymm0-ymm5 and the flags. xmm6, xmm10, xmm11 and xmm14
  ; are nonvolatile in the Win64 ABI and are therefore saved and restored here.
  ; No PureBasic local variable may be referenced inside the macro (rsp is moved).
  !SUB rsp,64
  !VMOVUPS [rsp],xmm6
  !VMOVUPS [rsp+16],xmm10
  !VMOVUPS [rsp+32],xmm11
  !VMOVUPS [rsp+48],xmm14
  ;Outer loop: For Y = Start_Y To End_Y - 1
  !MOV rcx,qword[v_RohBuffer] ;pointer into RohBuffer
  !ADD rcx,Buffer_Part
  !MOV r8,Start_Y ;counter of the outer loop (Y coordinate)
  !MULSD xmm0,qword[v_Y1]
  !ADDSD xmm0,qword[v_AchseX] ;xmm0 = start value of cY for this thread
  ;Broadcast cY from the register. Going through a shared global here would let another
  ;thread overwrite the value between the store and the reload.
  !VMOVDDUP xmm2,xmm0 ;xmm2 = cY, cY
  !VINSERTF128 ymm2,ymm2,xmm2,1 ;ymm2 = 4 * cY
  !VBROADCASTSD ymm10,qword[v_Y1]
  !VBROADCASTSD ymm11,qword[v_X1]
  !VBROADCASTSD ymm14,qword[v_Vier]
  !.Y_Loop: ;local label
  !VADDPD ymm2,ymm2,ymm10 ;advance cY by one row (also done before the first row)
  ;Inner loop: For X = 0 To ScreenX - 1, four pixels per pass
  !XOR r9,r9 ;counter of the inner loop (X coordinate)
  ;Build cX for four neighbouring pixels, A = AchseY, d = X1. The legacy-SSE ADDSD is used
  ;on purpose: it changes only lane 0 and leaves the other three lanes untouched.
  !VMOVDDUP xmm1,qword[v_AchseY] ;ymm1 lanes 3..0 = 0, 0, A, A
  !ADDSD xmm1,qword[v_X1] ;ymm1 lanes 3..0 = 0, 0, A, A+d
  !VPERM2F128 ymm3,ymm1,ymm1,0 ;copy the lower 128 bits to the upper half: ymm3 = A, A+d, A, A+d
  !ADDSD xmm3,qword[v_X1] ;ymm3 = A, A+d, A, A+2d
  !VSHUFPD ymm1,ymm3,ymm3,00001000b ;ymm1 = A, A+d, A+2d, A+2d
  !ADDSD xmm1,qword[v_X1] ;ymm1 = A, A+d, A+2d, A+3d
  !.X_Loop:
  !VADDPD ymm1,ymm1,ymm11 ;fourth of the four single X steps per pass (three more follow at the end)
  !VMOVAPD ymm3,ymm2 ;zy = cY
  !VMOVAPD ymm4,ymm1 ;zx = cX
  !XOR rax,rax ;iteration counter = 0
  !MOV r10,00001111b ;bits 0-3: lane is still iterating
  !XOR r11,r11 ;will hold the four iteration depths (4 words)
  !MOV qword[r12],0 ;clear the scratch words of this thread
  !.I_Loop:
  ;While (zx * zx + zy * zy) < 4 And (IterCounter < MaxIter)
  !VMULPD ymm5,ymm4,ymm4 ;ymm5 = zx * zx
  !VMULPD ymm6,ymm3,ymm3 ;ymm6 = zy * zy
  !VADDPD ymm0,ymm5,ymm6
  !VCMPNLTPD ymm0,ymm0,ymm14 ;lane mask: Not (zx*zx + zy*zy < 4.0)
  !VMOVMSKPD edx,ymm0 ;bit n = lane n is not below 4.0
  !AND edx,r10d ;keep only lanes that were still iterating
  !JZ .AlleK ;no lane escaped in this pass
  !CMP edx,00001111b
  !JNE @f
  ;all four lanes escaped in the same pass: replicate the counter into four words
  !VMOVQ xmm0,rax
  !VPSHUFLW xmm0,xmm0,0
  !VMOVQ r11,xmm0 ;r11 = 4 * iteration depth (one word each)
  !JMP .Farbgebung
  !@@:
  !TEST edx,00000001b ;lane 0 escaped?
  !JZ @f
  !MOV [r12+6],ax
  !@@:
  !TEST edx,00000010b ;lane 1 escaped?
  !JZ @f
  !MOV [r12+4],ax
  !@@:
  !TEST edx,00000100b ;lane 2 escaped?
  !JZ @f
  !MOV [r12+2],ax
  !@@:
  !TEST edx,00001000b ;lane 3 escaped?
  !JZ .AlleK
  !MOV [r12+0],ax
  !.AlleK:
  !MOV r11,[r12]
  !NOT edx
  !AND r10d,edx ;remove the escaped lanes from the active set
  !AND r10d,00001111b
  !JZ .Farbgebung ;every lane has escaped: done with these four pixels
  ;zy = 2 * zx * zy + cY
  !VMULPD ymm3,ymm3,ymm4
  !VADDPD ymm3,ymm3,ymm3
  !VADDPD ymm3,ymm3,ymm2 ;ymm2 = cY, ymm3 = zy
  ;zx = zx * zx - zy * zy + cX
  !VSUBPD ymm4,ymm5,ymm6
  !VADDPD ymm4,ymm4,ymm1 ;ymm1 = cX, ymm4 = zx
  !INC rax ;iteration counter
  !CMP rax,255 ;hard-coded maximum depth (equals the initial value of MaxIter)
  !JB .I_Loop
  ;Wend - maximum depth reached: every lane still active gets the depth in rax
  !CMP r10,00001111b
  !JE .AlleX
  !TEST r10,00000001b ;lane 0 still active?
  !JZ @f
  !MOV [r12+6],ax
  !@@:
  !TEST r10,00000010b ;lane 1 still active?
  !JZ @f
  !MOV [r12+4],ax
  !@@:
  !TEST r10,00000100b ;lane 2 still active?
  !JZ @f
  !MOV [r12+2],ax
  !@@:
  !MOV r11,[r12]
  !TEST r10,00001000b ;lane 3 still active?
  !JZ .Farbgebung
  !ADD r11,rax ;word 0 is still 0 for an active lane 3, so the addition stores the depth there
  !JMP .Farbgebung
  !.AlleX:
  !VMOVQ xmm0,rax
  !VPSHUFLW xmm0,xmm0,0
  !VMOVQ r11,xmm0 ;r11 = 4 * maximum depth (one word each)
  !.Farbgebung:
  !MOV [rcx],r11 ;store the four depths
  !ADD rcx,8 ;4 pixels of 2 bytes each
  !ADD r9,4
  !VADDPD ymm1,ymm1,ymm11
  !VADDPD ymm1,ymm1,ymm11
  !VADDPD ymm1,ymm1,ymm11
  ;Next X
  !CMP r9,1024 ;ScreenX
  !JB .X_Loop
  ;Next Y
  !INC r8
  !CMP r8,End_Y
  !JB .Y_Loop
  ;restore the nonvolatile XMM registers and release the save area
  !VMOVUPS xmm6,[rsp]
  !VMOVUPS xmm10,[rsp+16]
  !VMOVUPS xmm11,[rsp+32]
  !VMOVUPS xmm14,[rsp+48]
  !ADD rsp,64
  !VZEROUPPER ;clear the upper halves of all YMM registers
EndMacro
Procedure ThreadIter1(Dummy)
  ; Worker thread 1: computes the rows 0-255. Dummy is the unused CreateThread() value.
  SetThreadAffinityMask_(GetCurrentThread_(), 1) ;comment out for no core assignment, 1 = core 0
  ; r12 and r13 are callee-saved in the Win64 ABI, so they are preserved around the asm part.
  ; No PureBasic variable is referenced between the pushes and the pops.
  !PUSH r12
  !PUSH r13
  !LEA r12,[v_IterColor1] ;scratch qword of this thread
  !MOVQ xmm0,qword[v_XH1] ;first row as a double
  Thread(0, 0, 256) ;0 = start of this part in the raw buffer, 0 = first row, 256 = end row (exclusive). Adjust the parameters to the number of threads! For 1 core: Thread(0, 0, 1024)
  !POP r13
  !POP r12
EndProcedure
Procedure ThreadIter2(Dummy)
  ; Worker thread 2: computes the rows 256-511. Dummy is the unused CreateThread() value.
  SetThreadAffinityMask_(GetCurrentThread_(), 2) ;core 1
  ; r12 and r13 are callee-saved in the Win64 ABI, so they are preserved around the asm part.
  ; No PureBasic variable is referenced between the pushes and the pops.
  !PUSH r12
  !PUSH r13
  !LEA r12,[v_IterColor2] ;scratch qword of this thread
  !MOVQ xmm0,qword[v_XH2] ;first row as a double
  Thread(524288, 256, 512) ;524288 = 1024*2*256 = start of this part in the raw buffer, 256 = first row, 512 = end row (exclusive)
  !POP r13
  !POP r12
EndProcedure
Procedure ThreadIter3(Dummy)
  ; Worker thread 3: computes the rows 512-767. Dummy is the unused CreateThread() value.
  SetThreadAffinityMask_(GetCurrentThread_(), 4) ;core 2
  ; r12 and r13 are callee-saved in the Win64 ABI, so they are preserved around the asm part.
  ; No PureBasic variable is referenced between the pushes and the pops.
  !PUSH r12
  !PUSH r13
  !LEA r12,[v_IterColor3] ;scratch qword of this thread
  !MOVQ xmm0,qword[v_XH3] ;first row as a double
  Thread(1048576, 512, 768) ;1048576 = 1024*2*512 = start of this part in the raw buffer, 512 = first row, 768 = end row (exclusive)
  !POP r13
  !POP r12
EndProcedure
Procedure ThreadIter4(Dummy)
  ; Worker thread 4: computes the rows 768-1023. Dummy is the unused CreateThread() value.
  SetThreadAffinityMask_(GetCurrentThread_(), 8) ;core 3
  ; r12 and r13 are callee-saved in the Win64 ABI, so they are preserved around the asm part.
  ; No PureBasic variable is referenced between the pushes and the pops.
  !PUSH r12
  !PUSH r13
  !LEA r12,[v_IterColor4] ;scratch qword of this thread
  !MOVQ xmm0,qword[v_XH4] ;first row as a double
  Thread(1572864, 768, 1024) ;1572864 = 1024*2*768 = start of this part in the raw buffer, 768 = first row, 1024 = end row (exclusive)
  !POP r13
  !POP r12
EndProcedure
Procedure Frac()
  ; Renders one frame at the current Zoom:
  ;   1. derive the per-pixel steps and the start coordinates from Zoom
  ;   2. let four worker threads fill RohBuffer with 16-bit iteration depths (timed)
  ;   3. translate every depth into a colour via ColorBuffer and write it to the screen
  ;   4. update the statistics gadgets
  SetGadgetText(1, "Busy")
  TEXT2$ = "Zoom = " + StrD(Zoom, 2)
  SetGadgetText(2, TEXT2$)
  X1 = XScale / Zoom
  AchseY = (-0.5 / Zoom) + XStart ;- VersatzX
  Y1 = YScale / Zoom
  AchseX = (1.25 / Zoom) + YStart ;VersatzY
  ; Without a valid drawing context there is no buffer to write to: skip the frame.
  If StartDrawing(ScreenOutput()) = 0
    ProcedureReturn
  EndIf
  ZeichenBuffer = DrawingBuffer()
  IterCount = 0
  QueryPerformanceCounter_(@Start)
  ; Set the number of threads here
  Thread1 = CreateThread(@ThreadIter1(), 0)
  Thread2 = CreateThread(@ThreadIter2(), 0) ;remark for 1 Core
  Thread3 = CreateThread(@ThreadIter3(), 0) ;remark for 1 Core
  Thread4 = CreateThread(@ThreadIter4(), 0) ;remark for 1 Core
  ; Same here
  WaitThread(Thread1)
  WaitThread(Thread2) ;remark for 1 Core
  WaitThread(Thread3) ;remark for 1 Core
  WaitThread(Thread4) ;remark for 1 Core
  QueryPerformanceCounter_(@Ende)
  QueryPerformanceFrequency_(@Freq)
  Zeit = (Ende - Start) / Freq
  ZeitG + Zeit
  ; Colouring loop. Only registers that are volatile in the Win64 ABI are used:
  ;   rcx = pixels left, rdx = write pointer (screen), r9 = read pointer (RohBuffer),
  ;   r11 = palette base, r10 = four packed depths, r8 = depth / colour, eax = inner counter
  ; The palette must hold an entry for every depth value stored in RohBuffer.
  ; NOTE(review): the screen buffer is written as 1024*1024 consecutive 32-bit pixels,
  ; i.e. a pitch of 4096 bytes and a 32-bit pixel format are assumed - confirm with
  ; DrawingBufferPitch() / DrawingBufferPixelFormat() on the target system.
  !MOV rcx,1024*1024 ;number of pixels
  !MOV rdx,[v_ZeichenBuffer]
  !MOV r9,[v_RohBuffer]
  !MOV r11,[v_ColorBuffer]
  !@@:
  ;- read the iteration depths of 4 pixels
  !MOV r10,[r9]
  !MOV eax,4
  !LL:
  !MOV r8,r10
  !SHR r10,16
  !AND r8,0FFFFh ;every iteration depth is one word
  !ADD [v_IterCount],r8 ;for display only
  !MOV r8d,[r11+r8*4] ;look up the colour of this depth
  ;- write the 32-bit colour of 1 pixel to the screen buffer
  !MOV [rdx],r8d ;0-7:B, 8-15:G, 16-23:R
  !ADD rdx,4
  !SUB eax,1
  !JNZ LL
  ;- 4 pixels at a time
  !ADD r9,8
  !SUB rcx,4
  !JNZ @b
  StopDrawing()
  FlipBuffers()
  SetGadgetText(1, "Ready")
  SetGadgetText(7, "Zeit / Frame = " + StrD(Zeit) + " s")
  Z1 = IterCount / Zeit / 1000000
  SetGadgetText(6, "Iterationen = " + Str(IterCount) + " (" + StrD(Z1, 1) + " Mio/s)")
  Z2 = 1 / Zeit
  SetGadgetText(8, "Frames = " + StrD(Z2, 1) + " /s")
EndProcedure
!OverASM:
; Main program: open the window and the 1024x1024 screen, build the palette, zoom in up to
; Zoom_Max and back out to 1, then show the total time and wait until the window is closed.
If OpenWindow(0, 0, 0, 1300, 1024, "Apfelmännchen-Test-AVX-4D-4C", #PB_Window_MinimizeGadget | #PB_Window_MaximizeGadget) = 0 Or InitSprite() = 0 ;InitSprite() is needed for OpenWindowedScreen()
  MessageRequester("Fehler!", "Hier stimmt was nicht!")
  End
EndIf
TextGadget(1, 1030, 20, 45, 20, "")
TextGadget(2, 1030, 50, 250, 20, "")
TextGadget(3, 1030, 80, 150, 20, "Max. Iterationstiefe = " + Str(MaxIter))
TextGadget(4, 1030, 110, 250, 20, "")
TextGadget(5, 1030, 140, 250, 20, "")
TextGadget(6, 1030, 170, 250, 20, "")
TextGadget(7, 1030, 200, 200, 20, "")
TextGadget(8, 1030, 230, 260, 20, "")
TextGadget(9, 1030, 260, 260, 20, "")
; Scale the font to the display DPI. The device context is released again after the query.
hDC = GetDC_(WindowID(0))
FontHigh = Int(9.0 / (GetDeviceCaps_(hDC, #LOGPIXELSY) / 96.0))
ReleaseDC_(WindowID(0), hDC)
LoadFont(0, "Arial", FontHigh)
For i = 1 To 9
  SetGadgetFont(i, FontID(0))
Next
If OpenWindowedScreen(WindowID(0), 0, 0, #ScreenWidth, #ScreenHeight, 0, 0, 0);, #PB_Screen_NoSynchronization)
  Colors()
  RohBuffer = AllocateMemory(1024 * 1024 * 2) ;one 16-bit iteration depth per pixel
  If RohBuffer = 0 Or ColorBuffer = 0
    MessageRequester("Fehler!", "Hier stimmt was nicht!")
    End
  EndIf
  ; Zoom in. One window event is processed per frame; closing the window ends the program.
  While Zoom < Zoom_Max
    If WindowEvent() = #PB_Event_CloseWindow
      End
    EndIf
    Zoom = (Zoom * 1.04)
    Frac()
  Wend
  ; Zoom back out.
  While Zoom >= 1.00
    If WindowEvent() = #PB_Event_CloseWindow
      End
    EndIf
    Zoom = (Zoom / 1.04)
    Frac()
  Wend
  SetGadgetText(9, "Total Time = " + StrD(ZeitG) + " s")
  Repeat
    Delay(1)
  Until WindowEvent() = #PB_Event_CloseWindow
EndIf
To change the number of cores, see the remarks at lines 316 and 360-367.