Re: Schnelles CRC32 mit PCLMULQDQ
Verfasst: 08.05.2014 21:43
Sowas könnte mit Unicode zusammenhängen.Sooraa hat geschrieben:ts: die angegeben CRC32 Ergebnisse stimmen nicht überein!?
Das deutsche PureBasic-Forum
https://www.purebasic.fr/german/
Sowas könnte mit Unicode zusammenhängen.Sooraa hat geschrieben:ts: die angegeben CRC32 Ergebnisse stimmen nicht überein!?
Meine Ergebnisse sind bei einem Takt @1,4 GHz gemessen. Dafür sieht es doch gut aus.Sooraa hat geschrieben:Die Ergebnisse von hier auf Sandy Bridge @4.0 GHz:
Die Wette halte ichTommy hat geschrieben:Ich wette um 100 € das der Code von Helle nach ein paar PB Versionen nicht mehr funktioniert wie immer bei den Asm Codes.
Code: Alles auswählen
;CRC32-Fingerprint mit PCLMULQDQ. Verwendet wird Polynomial Reversed ($EDB88320).
;Basierend auf: http://stuff.mit.edu/afs/sipb/contrib/linux/arch/x86/crypto/crc32-pclmul_asm.S
;Lizenz beachten!
;PureBasic 5.22 LTS (Windows - x64)
;Helle 13.05.2014
Global.l CRC32, crc32i, CRC32C
Procedure.l CRC32_CL(Mem, Laenge)
;Test auf PCLMULQDQ
!MOV eax,1
!CPUID
!TEST ecx,2
!JNZ CL_OK ;kann losgehen!
CRC32Fingerprint(Mem, Laenge) ;doch wieder mit PB
ProcedureReturn
!CL_OK:
!MOV [v_CRC32],0 ;falls Laenge < 128
!MOV r9,[p.v_Laenge]
!AND r9,0FFFFFFFFFFFFFF80h
!JZ less_128
!MOV eax,0FFFFFFFFh ;Initialisierungswert für CRC32
!MOVD xmm0,eax
!MOV r8,[p.v_Mem]
!MOVDQA xmm1,[r8]
!MOVDQA xmm2,[r8+10h]
!MOVDQA xmm3,[r8+20h]
!MOVDQA xmm4,[r8+30h]
!PXOR xmm1,xmm0
!SUB r9,40h
!ADD r8,40h
!MOVDQA xmm0,[Lconstant_R2R1]
!loop_64:
!PREFETCHNTA [r8+0c0h] ;Cache "vorfüllen"
!MOVDQA xmm5,xmm1
!MOVDQA xmm6,xmm2
!MOVDQA xmm7,xmm3
!MOVDQA xmm8,xmm4
!PCLMULQDQ xmm1,xmm0,00h
!PCLMULQDQ xmm2,xmm0,00h
!PCLMULQDQ xmm3,xmm0,00h
!PCLMULQDQ xmm4,xmm0,00h
!PCLMULQDQ xmm5,xmm0,11h
!PCLMULQDQ xmm6,xmm0,11h
!PCLMULQDQ xmm7,xmm0,11h
!PCLMULQDQ xmm8,xmm0,11h
!PXOR xmm1,xmm5
!PXOR xmm2,xmm6
!PXOR xmm3,xmm7
!PXOR xmm4,xmm8
!PXOR xmm1,[r8]
!PXOR xmm2,[r8+10h]
!PXOR xmm3,[r8+20h]
!PXOR xmm4,[r8+30h]
!SUB r9,40h
!ADD r8,40h
!CMP r9,40h
!JGE loop_64
!MOVDQA xmm0,[Lconstant_R4R3]
!PREFETCHNTA [r8]
!MOVDQA xmm5,xmm1
!PCLMULQDQ xmm1,xmm0,00h
!PCLMULQDQ xmm5,xmm0,11h
!PXOR xmm1,xmm5
!PXOR xmm1,xmm2
!MOVDQA xmm5,xmm1
!PCLMULQDQ xmm1,xmm0,00h
!PCLMULQDQ xmm5,xmm0,11h
!PXOR xmm1,xmm5
!PXOR xmm1,xmm3
!MOVDQA xmm5,xmm1
!PCLMULQDQ xmm1,xmm0,00h
!PCLMULQDQ xmm5,xmm0,11h
!PXOR xmm1,xmm5
!PXOR xmm1,xmm4
!PCLMULQDQ xmm0,xmm1,01h
!PSRLDQ xmm1,08h
!PXOR xmm1,xmm0
!MOVDQA xmm2,xmm1
!MOVDQA xmm0,[Lconstant_R5]
!MOVDQA xmm3,[Lconstant_mask32]
!PSRLDQ xmm2,04h
!PAND xmm1,xmm3
!PCLMULQDQ xmm1,xmm0,00h
!PXOR xmm1,xmm2
!MOVDQA xmm0,[Lconstant_RUpoly]
!MOVDQA xmm2,xmm1
!PAND xmm1,xmm3
!PCLMULQDQ xmm1,xmm0,10h
!PAND xmm1,xmm3
!PCLMULQDQ xmm1,xmm0,00h
!PXOR xmm1,xmm2
!PEXTRD eax,xmm1,01h
!NOT eax ;wegen Polynomial Reversed
!MOV [v_CRC32],eax
!less_128: ;Rest mit PB (max.127 Bytes)
CRC32Fingerprint(Mem + Laenge - (Laenge & $7F), Laenge & $7F, CRC32) ;könnte auch noch in ASM erledigt werden
ProcedureReturn
;Konstanten
!Align 16
!Lconstant_R2R1:
!dq 0000000154442bd4h
!dq 00000001c6e41596h
!Lconstant_R4R3:
!dq 00000001751997d0h
!dq 00000000ccaa009eh
!Lconstant_R5:
!dq 0000000163cd6124h
!dq 0000000000000000h
!Lconstant_mask32:
!dq 00000000FFFFFFFFh
!dq 0000000000000000h
!Lconstant_RUpoly:
!dq 00000001DB710641h
!dq 00000001F7011641h
EndProcedure
Procedure.l crc1024pcl(Mem, Laenge)
!MOV rdx,0FFFFFFFFFFFFFFFFh ; rdx = crc0 = crc_init
!MOV rax,rdx ;für < 1024
!MOV r9, [p.v_Mem] ; @ is 16 byte aligned, this helps. PREFETCHNTA is no help here
!MOV r10, [p.v_Laenge]
!AND r10,0FFFFFFFFFFFFFC00h
!JZ less_1024
!SHR r10,10 ;Div 1024
!@@:
!XOR r8, r8 ; r8 = crc1
!XOR rax, rax ; rax = crc2
!CRC32 rdx, qword[r9] ; do first 8 bytes in rdx here for better pipelining
; this is a non-pipelined single instruction: latency 3, througput 1 cycle
!i = 8 ; no CPU-instruction
!Repeat BLOCKSIZE/8 ; no CPU-instruction, 42 repeats * 8 bytes = 336 bytes
!CRC32 r8, qword[r9 + i + 1*BLOCKSIZE] ; crc1
!CRC32 rax, qword[r9 + i + 2*BLOCKSIZE] ; crc2
!CRC32 rdx, qword[r9 + i + 0*BLOCKSIZE] ; crc0, process rdx last to avoid dependency with crc32 above
!i = i + 8 ; no CPU-instruction
!End Repeat ; no CPU-instruction, 336 * 3 bytes = 1.008 + first 8 = 1.016 bytes
!MOVDQA xmm0, dqword[K344_1] ; 2 constans: K1:K2
; now merge crc0 and crc1 into crc2
!MOVQ xmm1, r8 ; crc1
!PCLMULQDQ xmm1, xmm0, 0x10 ; multiply by K1, high qw to low qw in xmm1
!MOVQ xmm2, rdx ; crc0
!PCLMULQDQ xmm2, xmm0, 0x00 ; multiply by K2, low qw to low qw in xmm2
!CRC32 rax, qword[r9 + 1016] ; last 8 bytes, 1.024 bytes processed
!XOR r11, r11
!MOVQ r12, xmm1
!CRC32 r11, r12
!XOR rax, r11
!XOR r11, r11
!MOVQ r12, xmm2
!CRC32 r11, r12
!XOR rax, r11
!ADD r9,1024
!MOV rdx,rax
!DEC r10
!JNZ @b
!less_1024:
!MOV r10, [p.v_Laenge]
!AND r10,03FFh
!@@:
!CRC32 rax,byte[r9]
!INC r9
!DEC r10
!JNZ @b
!NOT eax
!MOV [v_crc32i], eax ; output result
ProcedureReturn
;Konstanten
!Align 16
!K344_1: dq 0x0e417f38a
!K344_2: dq 0x08f158014
!BLOCKSIZE = 336
EndProcedure
;Test Helle
A$ = "The quick brown fox jumps over the lazy dog."
LA = Len(A$)
Faktor = 9999999 ; = 10 Mio.
Buffer = AllocateMemory(LA * (Faktor + 1) + 16)
If Buffer
BufferA = Buffer
If BufferA & $0F ; muss Alignment 16 sein!
BufferA = Buffer + 16 - (BufferA & $0F)
EndIf
For i = 0 To Faktor
PokeS(BufferA + (i * LA), A$)
Next
Length = LA * (Faktor + 1)
Time_PB_A = ElapsedMilliseconds()
CRC32 = CRC32Fingerprint(BufferA, Length) ; Achtung, bei > 4GB splitten!
Time_PB_E = ElapsedMilliseconds() - Time_PB_A
PB$ = "CRC32_PB = " + Hex(CRC32 & $FFFFFFFF) + " in " + Str(Time_PB_E) + " ms"
Time_CL_A = ElapsedMilliseconds()
CRC32 = CRC32_CL(BufferA, Length)
Time_CL_E = ElapsedMilliseconds() - Time_CL_A
;Test Sooraa
; - crc32i extension by Sooraa, 05.05.2014; PureBasic 5.22 LTS (Windows - x64). SSE4.2 required. Feel free To use my code without restriction.
; - Helle created an high-speed design for PB and other polynomina with a factor of 1 to 6 against PB
; - With the extension of my ASM Macro "crc1024pcl" there is another speed factor of 1 to 9 against Helle's approach (approx. 1 to 55 ag. PB)
; - Helle mentioned 30 ms on his CPU. This will be a Haswell CPU, since with 18 * PCLMULQDQ Haswell's improved CRC32-latency shines through
; - But "non-Haswell riders" can use the native CRC32i machine instruction in a heavy pipelined fashion (crc32 latencies are close to zero)
; - My design is converted from the Intel White Paper 323405, "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
; - "Function to compute iSCSI CRC32 with PCLMULQDQ Instruction", by almost completely eliminating the crc32i-latencies by 3-stage pipelining
; - crc done "by 3" for fixed input size of 1.024 bytes, = 336 * 3 * 8 + 16, (BLOCKSIZE = 336). For insight purposes the code is commented.
; - The conversionrate for i5 (Westmere, 3.6 GHz): 19 ms for 440 MB = ca. 23,2 GB/Sek. = 0,155 cycle per byte like Intels performance test
; - The conversionrate for i7 (Sandy Bridge, 4.0 GHz): 15 ms for 440 MB = ca. 29,3 GB/Sek. = 0,136 cycle per byte
; - Plus the code is extremely small (<27 instructions with approx. 140 bytes) and the datasegment is tableless and just 8 bytes.
; - That means that since Sandy Bridge the code can even stay preprocessed in RISC code at the "L0 cache" with really no clock waisted.
; - So we achieved the maximum throughput and pressure "at the execution unit" under PB. Dont't forget the cache-warmup during tests.
; - Since we are that "close to the transistor", it would be interesting to see comparisons to state of the art AMD-CPUs.
;MS = MemorySize(BufferA) / 1024 ; 429.687
T1 = ElapsedMilliseconds()
;For I = 1 To MS ; let´s keep the testloop counter outside the ASM Macro
crc1024pcl(BufferA, Length) ; polynom 0x11EDC6F41
;Next I
T2 = ElapsedMilliseconds() - T1
;hier CRC32C Byte-by-Byte, also langsam, aber wegen Ergebnis-Test
T3 = ElapsedMilliseconds()
!MOV rdx,[v_BufferA]
!MOV rcx,[v_Length]
!MOV rax,0FFFFFFFFFFFFFFFFh
!@@:
!CRC32 rax,byte[rdx]
!INC rdx
!DEC rcx
!JNZ @b
!NOT eax
!MOV [v_CRC32C],eax
T4 = ElapsedMilliseconds() - T3
CL$ = "CRC32_CL = " + Hex(CRC32 & $FFFFFFFF) + " in " + Str(Time_CL_E) + " ms"
AL$ = "CRC32iPCL= " + Hex(crc32i & $FFFFFFFF) + " in " + Str(T2) + " ms"
CC$ = "CRC32C = " + Hex(CRC32C & $FFFFFFFF) + " in " + Str(T4) + " ms"
FreeMemory(Buffer)
;SetClipboardText("CRC32 für " + Str(Length) + " Bytes:" + #LF$ + PB$ + #LF$ + CL$ + #LF$ + AL$ + #LF$ + CC$)
MessageRequester("CRC32 für " + Str(Length) + " Bytes", PB$ + #LFCR$ + CL$ + #LFCR$ + AL$ + #LFCR$ + CC$)
EndIf
End
CRC32 für 440000000 Bytes:
CRC32_PB = 5FCF543F in 1802 ms
CRC32_CL = 5FCF543F in 140 ms
CRC32iPCL= 8A39EB97 in 36 ms
CRC32C = 8A39EB97 in 380 ms