SHA256 with CPU-SHA-Instruction-Set

Bare metal programming in PureBasic, for experienced users
Helle
Enthusiast
Enthusiast
Posts: 178
Joined: Wed Apr 12, 2006 7:59 pm
Location: Germany
Contact:

SHA256 with CPU-SHA-Instruction-Set

Post by Helle »

For this I found
https://groups.google.com/d/topic/fa.li ... UelRfl5hx4
This is a PureBasic translation together with a SHA256 test (SHA1 is left out):

Code: Select all

;SHA256-CPU-Instructions for Windows 64-Bit and Unicode-Test-String
;Tested with PB 5.61 (x64) Unicode and PB 5.45 LTS (x64) Unicode, CPU AMD Ryzen 7 1800X
;"Helle" Klaus Helbing, 04.12.2017
;Based on https://groups.google.com/d/topic/fa.linux.kernel/jUelRfl5hx4
;From this:
;This file is provided under a dual BSD/GPLv2 license.  When using or 
;redistributing this file, you may do so under either license. 
;
;GPL LICENSE SUMMARY 
;
;Copyright(c) 2015 Intel Corporation. 
;
;This program is free software; you can redistribute it and/or modify 
;it under the terms of version 2 of the GNU General Public License as 
;published by the Free Software Foundation. 
;
;This program is distributed in the hope that it will be useful, but 
;WITHOUT ANY WARRANTY; without even the implied warranty of 
;MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
;General Public License for more details. 
;
;BSD LICENSE 
;
;Copyright(c) 2015 Intel Corporation. 
;
;Redistribution and use in source and binary forms, with or without 
;modification, are permitted provided that the following conditions 
;are met: 
;
;         * Redistributions of source code must retain the above copyright 
;           notice, this list of conditions and the following disclaimer. 
;         * Redistributions in binary form must reproduce the above copyright 
;           notice, this list of conditions and the following disclaimer in 
;           the documentation and/or other materials provided with the 
;           distribution. 
;         * Neither the name of Intel Corporation nor the names of its 
;           contributors may be used to endorse or promote products derived 
;           from this software without specific prior written permission. 
;
;THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
;"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
;LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
;A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
;OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
;SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
;LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
;DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
;THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
;(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
;OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;------------------------------------------------------------------------------

;The inline assembly below uses 64-bit registers and expects UTF-16 strings.
CompilerIf #PB_Compiler_Processor <> #PB_Processor_x64
  CompilerError "Only x64 platform supported"
CompilerEndIf
CompilerIf #PB_Compiler_Unicode = 0
  CompilerError "CPU_SHA256() expects UTF-16 strings, compile in Unicode mode"
CompilerEndIf

;Check for CPU-SHA-Instructions (CPUID leaf 7, sub-leaf 0, EBX bit 29).
;CPUID overwrites EBX, so RBX is parked in r8 and put back afterwards.
;Leaf 7 is only queried when leaf 0 reports that it exists.
!mov r8,rbx
!xor r9d,r9d                 ;Feature bits, stay 0 if leaf 7 is missing
!xor eax,eax
!cpuid                       ;eax = highest basic leaf
!cmp eax,7
!jb SHA256_CpuidDone
!mov eax,7
!xor ecx,ecx
!cpuid
!mov r9d,ebx
!SHA256_CpuidDone:
!mov rbx,r8
!test r9d,20000000h          ;Bit29 SHA
!jnz IsSHA
MessageRequester("Ooops!", "No CPU-Support for SHA-Instructions!" + #CRLF$ + "End")
End

!IsSHA:

Declare.s CPU_SHA256(pSource.q)

;Generate a long Test-String, Unicode for this test
Source$ = "The quick brown fox jumps over the lazy dog"    ;Or your own 
For i = 1 To 23
  Source$ + Source$
Next

;Test with CPU-Instructions
TA_CPU = ElapsedMilliseconds()
  Res_CPU$ = CPU_SHA256(@Source$)                ;Pointer to string
TE_CPU = ElapsedMilliseconds() - TA_CPU

;Test with PB
UseSHA2Fingerprint()
TA_PB = ElapsedMilliseconds()
  Res_PB$ = UCase(StringFingerprint(Source$, #PB_Cipher_SHA2, 256))
TE_PB = ElapsedMilliseconds() - TA_PB

;Both results in one requester, one line per implementation
Display$ = "CPU: " + Res_CPU$ + "  Time: " + Str(TE_CPU) + "ms" + #CRLF$ + "PB:    " + Res_PB$ + "  Time: " + Str(TE_PB) + "ms"
;SetClipboardText(Display$)
;CPU: 46FFC4A3DA4F60B940A0058764689FE79863F1C77C1A11230F1CF740666C5748  Time: 238ms
;PB:  46FFC4A3DA4F60B940A0058764689FE79863F1C77C1A11230F1CF740666C5748  Time: 2281ms
MessageRequester("SHA256-CPU-Test Unicode", Display$) 
End

Procedure.s CPU_SHA256(pSource.q)
;SHA-256 of a zero-terminated UTF-16 string, computed with the CPU SHA instructions.
;  pSource : address of the first character (e.g. @Text$); 0 is hashed like an empty string
;  Returns : the digest as 64 hex digits
;Every 16-bit character is narrowed to one byte with PACKUSWB before it is hashed, so only
;characters up to $FF are hashed with their own value (PACKUSWB saturates everything else).
;The work area is the static buffer Varis, so the procedure is not reentrant.
;
;Layout of the work area (r8 = first 16-byte boundary after Varis):
;  r8+0    Last_Chunk  256 bytes, the last two 64-byte blocks, still 2 bytes per character
;  r8+256  Chunks      number of 64-byte blocks including the padding
;  r8+264  Chunk_Rest  Size % 64
;  r8+272  Size        string length in characters
;  r8+288  saved rsi, rdi, r10, r11, r12, r15
;  r8+336  saved xmm4..xmm11
;r8 is never written again after it has been set up, it is needed for the restore at the end.

  !lea rax,[Varis]           ;For variables etc.

  ;Set Align 16 for variables
  !mov rdx,rax
  !and rdx,0fh
  !add rax,16
  !sub rax,rdx
  !mov r8,rax                ;Last_Chunk

  ;Save registers: everything used below except rax, rcx, rdx, r8 and xmm0..xmm3
  !mov [r8+288],rsi
  !mov [r8+296],rdi
  !mov [r8+304],r10
  !mov [r8+312],r11
  !mov [r8+320],r12
  !mov [r8+328],r15
  !movdqa [r8+336],xmm4
  !movdqa [r8+352],xmm5
  !movdqa [r8+368],xmm6
  !movdqa [r8+384],xmm7
  !movdqa [r8+400],xmm8
  !movdqa [r8+416],xmm9
  !movdqa [r8+432],xmm10
  !movdqa [r8+448],xmm11

  ;Clear Last_Chunk. It still holds the tail of the previous message and the previous
  ;digest (written to Varis at the end), both would end up in the padding otherwise.
  !cld
  !mov rdi,r8
  !xor eax,eax
  !mov ecx,32                ;32 * 8 = 256 bytes
  !rep stosq

  !mov r15,[p.v_pSource]     ;Pointer to string
  !test r15,r15
 !jnz @f
  !mov r15,r8                ;Null pointer: read the cleared buffer instead (= empty string)
 !@@:

  !lea r10,[r8+256]          ;Chunks
  !lea r11,[r8+264]          ;Chunk_Rest
  !lea r12,[r8+272]          ;Size

  ;Len(String)
  ;NOTE(review): each step reads 16 bytes, so up to 14 bytes behind the terminator are read
  !mov rdx,r15
  !mov rax, -16
  !pxor xmm1,xmm1  
 !@@:
  !add rax,16
  !pcmpistri xmm1,dqword[rdx+rax],00001001b      ;Unicode  Bit0=1 and Bit1=0 -> String-Chars are unsigned Words, Bit2=0 and Bit3=1 -> Test for equal each
 !jnz @b
  !shr rax,1                                     ;Unicode
  !add rax,rcx
  !mov [r12],rax

  ;Chunks = ((Size + 8) / 64) + 1
  !mov rdx,rax
  !add rdx,8
  !shr rdx,6
  !add rdx,1
  !mov [r10],rdx             ;Chunks

  ;Chunk_Rest = Size %64
  !mov rcx,64
  !xor rdx,rdx
  !div rcx
  !mov [r11],rdx

  !mov rax,[r12]             ;Size
  !shl rax,3                 ;Bits Size
  !bswap rax                 ;To Big Endian
  !movq xmm0,rax             ;Unicode
  !pxor xmm1,xmm1
  !punpcklbw xmm0,xmm1       ;"Blow-Up" to Unicode

  !cmp qword[r10],1
 !je .Only_1Chunk

  ;Copy String-Part: the tail of the string that belongs to the last two blocks
  !mov rdi,r8
  !mov rax,r15
  !cmp qword[r12],64
 !jbe @f
  !mov rdx,[r10]
  !sub rdx,2
  !shl rdx,7                 ;(Chunks - 2) * 128 bytes of Unicode source
  !add rax,rdx
 !@@:
  !cld
  !mov rsi,rax
  !mov rcx,[r11]
  !cmp rcx,56
 !jae @f
  !add rcx,64                ;Rest fits together with the length: one full block precedes it
 !@@:
  !rep movsw

  !mov rax,r8 
  !movdqu [rax+120*2],xmm0   ;Unicode

 !jmp @f
 !.Only_1Chunk:
  ;Copy String
  !mov rdi,r8
  !mov rsi,r15
  !mov rcx,[r11]
  !rep movsw
  !mov rax,r8

  !movdqu [rax+56*2],xmm0    ;Unicode
 !@@:
  !mov word[rdi],80h         ;Set Bit Unicode

  !cmp qword[r10],3          ;Chunks
 !jae @f
  !mov r15,r8                ;pSource=Last_Chunk

 !@@:
  ;Start_Values
  !movdqu xmm1,dqword[STATE0]
  !movdqu xmm2,dqword[STATE1]

  !pshufd xmm1,xmm1,0b1h     ;CDAB
  !pshufd xmm2,xmm2,1bh      ;EFGH
  !movdqa xmm7,xmm1
  !palignr xmm1,xmm2,8       ;ABEF
  !pblendw xmm2,xmm7,0f0h    ;CDGH

  !movdqa xmm8,dqword[PSHUFFLE_BYTE_FLIP_MASK]
  !lea rax,[K256]

  !mov rsi,r15
  !mov rcx,[r10]

 !.Lloop0:
  ;Save hash values for addition after rounds
  !movdqa xmm9,xmm1          ;Save ABEF
  !movdqa xmm10,xmm2         ;Save CDGH
  ;Rounds 0-3
  !movdqu xmm0,[rsi]         ;String is Unicode, we need ASCII
  !movdqu xmm11,[rsi+16]
  !packuswb xmm0,xmm11       ;ASCII

  !pshufb xmm0,xmm8
  !movdqa xmm3,xmm0
  !paddd xmm0,[rax]
  !sha256rnds2 xmm2,xmm1
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  ;Rounds 4-7
  !movdqu xmm0,[rsi+32]      ;Unicode
  !movdqu xmm11,[rsi+48]
  !packuswb xmm0,xmm11

  !pshufb xmm0,xmm8
  !movdqa xmm4,xmm0
  !paddd xmm0,[rax+16]
  !sha256rnds2 xmm2,xmm1
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm3,xmm4
  ;Rounds 8-11
  !movdqu xmm0,[rsi+64]      ;Unicode
  !movdqu xmm11,[rsi+80]
  !packuswb xmm0,xmm11

  !pshufb xmm0,xmm8
  !movdqa xmm5,xmm0
  !paddd xmm0,[rax+32]
  !sha256rnds2 xmm2,xmm1
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm4,xmm5
  ;Rounds 12-15
  !movdqu xmm0,[rsi+96]      ;Unicode
  !movdqu xmm11,[rsi+112]
  !packuswb xmm0,xmm11

  !pshufb xmm0,xmm8
  !movdqa xmm6,xmm0
  !paddd xmm0,[rax+48]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm6
  !palignr xmm7,xmm5,4
  !paddd xmm3,xmm7
  !sha256msg2 xmm3,xmm6
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm5,xmm6
  ;Rounds 16-19
  !movdqa xmm0,xmm3
  !paddd xmm0,[rax+64]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm3
  !palignr xmm7,xmm6,4
  !paddd xmm4,xmm7
  !sha256msg2 xmm4,xmm3
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm6,xmm3
  ;Rounds 20-23
  !movdqa xmm0,xmm4
  !paddd xmm0,[rax+80]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm4
  !palignr xmm7,xmm3,4
  !paddd xmm5,xmm7
  !sha256msg2 xmm5,xmm4
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm3,xmm4
  ;Rounds 24-27
  !movdqa xmm0,xmm5
  !paddd xmm0,[rax+96]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm5
  !palignr xmm7,xmm4,4
  !paddd xmm6,xmm7
  !sha256msg2 xmm6,xmm5
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm4,xmm5
  ;Rounds 28-31
  !movdqa xmm0,xmm6
  !paddd xmm0,[rax+112]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm6
  !palignr xmm7,xmm5,4
  !paddd xmm3,xmm7
  !sha256msg2 xmm3,xmm6
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm5,xmm6
  ;Rounds 32-35
  !movdqa xmm0,xmm3
  !paddd xmm0,[rax+128]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm3
  !palignr xmm7,xmm6,4
  !paddd xmm4,xmm7
  !sha256msg2 xmm4,xmm3
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm6,xmm3
  ;Rounds 36-39
  !movdqa xmm0,xmm4
  !paddd xmm0,[rax+144]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm4
  !palignr xmm7,xmm3,4
  !paddd xmm5,xmm7
  !sha256msg2 xmm5,xmm4
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm3,xmm4
  ;Rounds 40-43
  !movdqa xmm0,xmm5
  !paddd xmm0,[rax+160]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm5
  !palignr xmm7,xmm4,4
  !paddd xmm6,xmm7
  !sha256msg2 xmm6,xmm5
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm4,xmm5
  ;Rounds 44-47
  !movdqa xmm0,xmm6
  !paddd xmm0,[rax+176]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm6
  !palignr xmm7,xmm5,4
  !paddd xmm3,xmm7
  !sha256msg2 xmm3,xmm6
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm5,xmm6
  ;Rounds 48-51
  !movdqa xmm0,xmm3
  !paddd xmm0,[rax+192]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm3
  !palignr xmm7,xmm6,4
  !paddd xmm4,xmm7
  !sha256msg2 xmm4,xmm3
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  !sha256msg1 xmm6,xmm3
  ;Rounds 52-55
  !movdqa xmm0,xmm4
  !paddd xmm0,[rax+208]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm4
  !palignr xmm7,xmm3,4
  !paddd xmm5,xmm7
  !sha256msg2 xmm5,xmm4
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  ;Rounds 56-59
  !movdqa xmm0,xmm5
  !paddd xmm0,[rax+224]
  !sha256rnds2 xmm2,xmm1
  !movdqa xmm7,xmm5
  !palignr xmm7,xmm4,4
  !paddd xmm6,xmm7
  !sha256msg2 xmm6,xmm5
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  ;Rounds 60-63
  !movdqa xmm0,xmm6
  !paddd xmm0,[rax+240]
  !sha256rnds2 xmm2,xmm1
  !pshufd xmm0,xmm0,0eh
  !sha256rnds2 xmm1,xmm2
  ;Add current hash values with previously saved
  !paddd xmm1,xmm9
  !paddd xmm2,xmm10
  ;Increment Data pointer and loop if more to process
  !dec rcx                     ;Chunks
 !jz @f
  !add rsi,128                 ;Unicode
  !cmp rcx,2
 !jne .Lloop0
  !cmp qword[r10],3
 !jb .Lloop0                  ;rsi is already Last_Chunk
  !mov rsi,r8                ;Change Source: the last two blocks come from Last_Chunk
 !jmp .Lloop0
 !@@:
  ;Write hash values back in the correct order
  !pshufd xmm1,xmm1,1bh
  !pshufd xmm2,xmm2,0b1h
  !movdqa xmm7,xmm1
  !pblendw xmm1,xmm2,0f0h
  !palignr xmm2,xmm7,8

  !lea rdi,[Varis]
  !movdqu [rdi],xmm1
  !movdqu [rdi+16],xmm2

  ;Restore registers (r8 still points to the work area)
  !movdqa xmm4,[r8+336]
  !movdqa xmm5,[r8+352]
  !movdqa xmm6,[r8+368]
  !movdqa xmm7,[r8+384]
  !movdqa xmm8,[r8+400]
  !movdqa xmm9,[r8+416]
  !movdqa xmm10,[r8+432]
  !movdqa xmm11,[r8+448]
  !mov rsi,[r8+288]
  !mov rdi,[r8+296]
  !mov r10,[r8+304]
  !mov r11,[r8+312]
  !mov r12,[r8+320]
  !mov r15,[r8+328]

  ;The digest sits in the first 32 bytes of Varis, eight 32-bit words
  For i = ?Varis To ?Varis + 28 Step 4
    Res$ + RSet(Hex(PeekL(i) & $FFFFFFFF), 8, "0")
  Next
 ProcedureReturn Res$

!Align 16
  ;Constants, old known values
  ;The first 32 bits of the fractional parts of the square roots of the first 8 primes 2..19, Big-Endian!
  ;$6a09e667, $bb67ae85, $3c6ef372, $a54ff53a, $510e527f, $9b05688c, $1f83d9ab, $5be0cd19
  !STATE0 dq 0bb67ae856a09e667h,0a54ff53a3c6ef372h
  !STATE1 dq 9b05688c510e527fh,5be0cd191f83d9abh
  !PSHUFFLE_BYTE_FLIP_MASK dq 0405060700010203h,0c0d0e0f08090a0bh
  !K256:
  !dd 428a2f98h,71374491h,0b5c0fbcfh,0e9b5dba5h
  !dd 3956c25bh,59f111f1h,923f82a4h,0ab1c5ed5h
  !dd 0d807aa98h,12835b01h,243185beh,550c7dc3h
  !dd 72be5d74h,80deb1feh,9bdc06a7h,0c19bf174h
  !dd 0e49b69c1h,0efbe4786h,0fc19dc6h,240ca1cch
  !dd 2de92c6fh,4a7484aah,5cb0a9dch,76f988dah
  !dd 983e5152h,0a831c66dh,0b00327c8h,0bf597fc7h
  !dd 0c6e00bf3h,0d5a79147h,6ca6351h,14292967h
  !dd 27b70a85h,2e1b2138h,4d2c6dfch,53380d13h
  !dd 650a7354h,766a0abbh,81c2c92eh,92722c85h
  !dd 0a2bfe8a1h,0a81a664bh,0c24b8b70h,0c76c51a3h
  !dd 0d192e819h,0d6990624h,0f40e3585h,106aa070h
  !dd 19a4c116h,1e376c08h,2748774ch,34b0bcb5h
  !dd 391c0cb3h,4ed8aa4ah,5b9cca4fh,682e6ff3h
  !dd 748f82eeh,78a5636fh,84c87814h,8cc70208h
  !dd 90befffah,0a4506cebh,0bef9a3f7h,0c67178f2h

 DataSection
  ;Variables
  Varis:                     ;For Res_CPU$
  !Varis:
  !times 512 db 0            ;Work area, layout see top of the procedure
 EndDataSection
EndProcedure
Result for the Test-String:
CPU: 46FFC4A3DA4F60B940A0058764689FE79863F1C77C1A11230F1CF740666C5748 Time: 238ms
PB: 46FFC4A3DA4F60B940A0058764689FE79863F1C77C1A11230F1CF740666C5748 Time: 2281ms
Have fun!
Helle
User avatar
netmaestro
PureBasic Bullfrog
PureBasic Bullfrog
Posts: 8422
Joined: Wed Jul 06, 2005 5:42 am
Location: Fort Nelson, BC, Canada

Re: SHA256 with CPU-SHA-Instruction-Set

Post by netmaestro »

I get an illegal instruction error on line 309 :cry: (Intel i7)
BERESHEIT
cas
Enthusiast
Enthusiast
Posts: 597
Joined: Mon Nov 03, 2008 9:56 pm

Re: SHA256 with CPU-SHA-Instruction-Set

Post by cas »

You have to compile it with PB x64.
davido
Addict
Addict
Posts: 1890
Joined: Fri Nov 09, 2012 11:04 pm
Location: Uttoxeter, UK

Re: SHA256 with CPU-SHA-Instruction-Set

Post by davido »

I am using an Intel i7 5960 with PureBasic x64 and get:
Debugger wrote:PureBasic.asm [426]
sha256rnds2 xmm2,xmm1
error: illegal instruction.
If I switch off the Debugger, 'ere running, the bracketed number changes from 426 to 330.
DE AA EB
User avatar
bbanelli
Enthusiast
Enthusiast
Posts: 543
Joined: Tue May 28, 2013 10:51 pm
Location: Europe
Contact:

Re: SHA256 with CPU-SHA-Instruction-Set

Post by bbanelli »

Download latest FAsm.

https://flatassembler.net/download.php
version 1.72 (Oct 10, 2017)

[+] Support for Intel AVX-512, SHA, CLFLUSHOPT, CLWB, PCOMMIT, ADX, RDSEED, SMAP and MPX instruction sets.
"If you lie to the compiler, it will get its revenge."
Henry Spencer
https://www.pci-z.com/
Helle
Enthusiast
Enthusiast
Posts: 178
Joined: Wed Apr 12, 2006 7:59 pm
Location: Germany
Contact:

Re: SHA256 with CPU-SHA-Instruction-Set

Post by Helle »

Sorry, but whenever I install a new PB version, my first step is to copy the latest FAsm version into the PB Compilers directory.
SHA support has been available since FAsm version 1.71.40 (Oct 19, 2015) - for 2 years already :D !
User_Russian
Addict
Addict
Posts: 1441
Joined: Wed Nov 12, 2008 5:01 pm
Location: Russia

Re: SHA256 with CPU-SHA-Instruction-Set

Post by User_Russian »

Helle wrote: Mon Dec 04, 2017 10:27 pmSHA1 is out
Please add SHA1.
Helle
Enthusiast
Enthusiast
Posts: 178
Joined: Wed Apr 12, 2006 7:59 pm
Location: Germany
Contact:

Re: SHA256 with CPU-SHA-Instruction-Set

Post by Helle »

OK, but only because I started this thread.
Please, no questions about the C backend!

Code: Select all

;- https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
;- PureBasic 6.02 LTS (x64), ASM-Backend
;- Source for this Test: Unicode String
;- SHA1 is unsafe (broken)!

;The inline assembly below uses 64-bit registers.
CompilerIf #PB_Compiler_Processor <> #PB_Processor_x64
  CompilerError "Only x64 platform supported"
CompilerEndIf

;Check for CPU-SHA-Instructions (CPUID leaf 7, sub-leaf 0, EBX bit 29).
;CPUID overwrites EBX, so RBX is parked in r8 and put back afterwards.
;Leaf 7 is only queried when leaf 0 reports that it exists.
!mov r8,rbx
!xor r9d,r9d                 ;Feature bits, stay 0 if leaf 7 is missing
!xor eax,eax
!cpuid                       ;eax = highest basic leaf
!cmp eax,7
!jb SHA1_CpuidDone
!mov eax,7
!xor ecx,ecx
!cpuid
!mov r9d,ebx
!SHA1_CpuidDone:
!mov rbx,r8
!test r9d,$20000000          ;Bit29 SHA
!jnz IsSHA
MessageRequester("Ooops!", "No CPU-Support for SHA-Instructions!" + #CRLF$ + "End")
End
!IsSHA:

Define.q pSource, Chunks, Chunks64, pDest, Size, Size_BS, Single

Source$ = "The quick brown fox jumps over the lazy dog"    ;or...
For i = 1 To 24
  Source$ + Source$
Next
;Source$ = ""                 ;for Test

;PB_A PB_A PB_A PB_A PB_A PB_A PB_A PB_A PB_A PB_A PB_A PB_A 
;PB-Test
UseSHA1Fingerprint()
TA_PB = ElapsedMilliseconds()
Res_PB$ = StringFingerprint(Source$, #PB_Cipher_SHA1)
TE_PB = ElapsedMilliseconds() - TA_PB
;PB_E PB_E PB_E PB_E PB_E PB_E PB_E PB_E PB_E PB_E PB_E PB_E

;CPU_A CPU_A CPU_A CPU_A CPU_A CPU_A CPU_A CPU_A CPU_A CPU_A 
TA_CPU = ElapsedMilliseconds()
pSource = @Source$

pDest = AllocateMemory(20)   ;Receives the 5 x 32 bit digest

Size = Len(Source$)
Chunks = ((Size + 8) / 64) + 1
Chunks64 = AllocateMemory(Chunks * 64) ;SHL; the padding relies on this buffer being zero-filled

If pDest = 0 Or Chunks64 = 0
  MessageRequester("Ooops!", "Out of memory!" + #CRLF$ + "End")
  End
EndIf

!mov rax,[v_Size]
!shl rax,3
!bswap rax
!mov [v_Size_BS],rax         ;Bits to encoded

PokeQ(Chunks64 + (Chunks * 64) - 8, Size_BS)

Single = Size % 16           ;Characters in front of the 16-character groups
;convert Unicode -> ASCII (low byte of every character)
If Size > 0
  j = 0
  For i = 0 To Single - 1
    PokeB(Chunks64 + i, PeekW(pSource + j))
    j + 2
  Next
EndIf

;From here up to the restore below there is only assembly.
;Save registers: everything used except rax, rcx, rdx, r8, r9 and xmm0..xmm3
!mov [SHA1_RegSave],rsi
!mov [SHA1_RegSave+8],rdi
!movdqu [SHA1_RegSave+16],xmm4
!movdqu [SHA1_RegSave+32],xmm5
!movdqu [SHA1_RegSave+48],xmm6
!movdqu [SHA1_RegSave+64],xmm7
!movdqu [SHA1_RegSave+80],xmm8
!movdqu [SHA1_RegSave+96],xmm9

!mov rdi,[v_Chunks64]
!add rdi,[v_Single]

!cmp [v_Size],16
!jb ShortString
;convert Unicode -> ASCII, 16 characters per step.
;Plain SSE2 is used (no AVX needed), and the high byte is masked off so that the
;result is the low byte of every character, exactly like the PokeB() loop above.
!mov rsi,[v_pSource]
!mov r8,[v_Single]
!shl r8,1
!add rsi,r8

!mov rcx,[v_Size]
!sub rcx,[v_Single]
!shr rcx,4

!pcmpeqw xmm1,xmm1
!psrlw xmm1,8                ;xmm1 = 8 x $00FF

!xor r8,r8
!mov r9,16
!@@:
!movdqu xmm2,[rsi+r8]
!movdqu xmm3,[rsi+r9]
!pand xmm2,xmm1
!pand xmm3,xmm1
!packuswb xmm2,xmm3          ;No saturation possible after the masking
!movdqu [rdi],xmm2
!add r8,32
!add r9,32
!add rdi,16
!dec rcx
!jnz @b
!ShortString:

!mov byte[rdi],80h           ;set Marker-Bit

;------

!mov rdi,[v_pDest]
!mov rsi,[v_Chunks64]

;load initial hash values
!movdqu xmm0,dqword[CONST_H0_H4]
!movdqu xmm1,dqword[CONST_H0_H4+16]
!movdqu xmm7,dqword[PSHUFFLE_BYTE_FLIP_MASK]

!mov rcx,[v_Chunks]
!Lloop0:
!movdqa xmm8,xmm1
!movdqa xmm9,xmm0
;Rounds 0-3
!movdqu xmm3,[rsi]
!pshufb xmm3,xmm7
!paddd xmm1,xmm3
!movdqa xmm2,xmm0
!sha1rnds4 xmm0,xmm1,0
;Rounds 4-7
!movdqu xmm4,[rsi+16]
!pshufb xmm4,xmm7
!sha1nexte xmm2,xmm4
!movdqa xmm1,xmm0
!sha1rnds4 xmm0,xmm2,0
!sha1msg1 xmm3,xmm4
;Rounds 8-11
!movdqu xmm5,[rsi+32]
!pshufb xmm5,xmm7
!sha1nexte xmm1,xmm5
!movdqa xmm2,xmm0
!sha1rnds4 xmm0,xmm1,0
!sha1msg1 xmm4,xmm5
!pxor xmm3,xmm5
;Rounds 12-15
!movdqu xmm6,[rsi+48]
!pshufb xmm6,xmm7
!sha1nexte xmm2,xmm6
!movdqa xmm1,xmm0
!sha1msg2 xmm3,xmm6
!sha1rnds4 xmm0,xmm2,0
!sha1msg1 xmm5,xmm6
!pxor xmm4,xmm6
;Rounds 16-19
!sha1nexte xmm1,xmm3
!movdqa xmm2,xmm0
!sha1msg2 xmm4,xmm3
!sha1rnds4 xmm0,xmm1,0
!sha1msg1 xmm6,xmm3
!pxor xmm5,xmm3
;Rounds 20-23
!sha1nexte xmm2,xmm4
!movdqa xmm1,xmm0
!sha1msg2 xmm5,xmm4
!sha1rnds4 xmm0,xmm2,1
!sha1msg1 xmm3,xmm4
!pxor xmm6,xmm4
;Rounds 24-27
!sha1nexte xmm1,xmm5
!movdqa xmm2,xmm0
!sha1msg2 xmm6,xmm5
!sha1rnds4 xmm0,xmm1,1
!sha1msg1 xmm4,xmm5
!pxor xmm3,xmm5
;Rounds 28-31
!sha1nexte xmm2,xmm6
!movdqa xmm1,xmm0
!sha1msg2 xmm3,xmm6
!sha1rnds4 xmm0,xmm2,1
!sha1msg1 xmm5,xmm6
!pxor xmm4,xmm6
;Rounds 32-35
!sha1nexte xmm1,xmm3
!movdqa xmm2,xmm0
!sha1msg2 xmm4,xmm3
!sha1rnds4 xmm0,xmm1,1
!sha1msg1 xmm6,xmm3
!pxor xmm5,xmm3
;Rounds 36-39
!sha1nexte xmm2,xmm4
!movdqa xmm1,xmm0
!sha1msg2 xmm5,xmm4
!sha1rnds4 xmm0,xmm2,1
!sha1msg1 xmm3,xmm4
!pxor xmm6,xmm4
;Rounds 40-43
!sha1nexte xmm1,xmm5
!movdqa xmm2,xmm0
!sha1msg2 xmm6,xmm5
!sha1rnds4 xmm0,xmm1,2
!sha1msg1 xmm4,xmm5
!pxor xmm3,xmm5
;Rounds 44-47
!sha1nexte xmm2,xmm6
!movdqa xmm1,xmm0
!sha1msg2 xmm3,xmm6
!sha1rnds4 xmm0,xmm2,2
!sha1msg1 xmm5,xmm6
!pxor xmm4,xmm6
;Rounds 48-51
!sha1nexte xmm1,xmm3
!movdqa xmm2,xmm0
!sha1msg2 xmm4,xmm3
!sha1rnds4 xmm0,xmm1,2
!sha1msg1 xmm6,xmm3
!pxor xmm5,xmm3
;Rounds 52-55
!sha1nexte xmm2,xmm4
!movdqa xmm1,xmm0
!sha1msg2 xmm5,xmm4
!sha1rnds4 xmm0,xmm2,2
!sha1msg1 xmm3,xmm4
!pxor xmm6,xmm4
;Rounds 56-59
!sha1nexte xmm1,xmm5
!movdqa xmm2,xmm0
!sha1msg2 xmm6,xmm5
!sha1rnds4 xmm0,xmm1,2
!sha1msg1 xmm4,xmm5
!pxor xmm3,xmm5
;Rounds 60-63
!sha1nexte xmm2,xmm6
!movdqa xmm1,xmm0
!sha1msg2 xmm3,xmm6
!sha1rnds4 xmm0,xmm2,3
!sha1msg1 xmm5,xmm6
!pxor xmm4,xmm6
;Rounds 64-67
!sha1nexte xmm1,xmm3
!movdqa xmm2,xmm0
!sha1msg2 xmm4,xmm3
!sha1rnds4 xmm0,xmm1,3
!sha1msg1 xmm6,xmm3
!pxor xmm5,xmm3
;Rounds 68-71
!sha1nexte xmm2,xmm4
!movdqa xmm1,xmm0
!sha1msg2 xmm5,xmm4
!sha1rnds4 xmm0,xmm2,3
!pxor xmm6,xmm4
;Rounds 72-75
!sha1nexte xmm1,xmm5
!movdqa xmm2,xmm0
!sha1msg2 xmm6,xmm5
!sha1rnds4 xmm0,xmm1,3
;Rounds 76-79
!sha1nexte xmm2,xmm6
!movdqa xmm1,xmm0
!sha1rnds4 xmm0,xmm2,3

;Add current hash values with previously saved
!sha1nexte xmm1,xmm8
!paddd xmm0,xmm9

!add rsi,64
!dec rcx                     ;Chunks
!jnz Lloop0

;Write hash values back in the correct order
!pshufd xmm0,xmm0,1bh
!movdqu [rdi],xmm0
!pextrd [rdi+16],xmm1,3

;Restore registers
!movdqu xmm4,[SHA1_RegSave+16]
!movdqu xmm5,[SHA1_RegSave+32]
!movdqu xmm6,[SHA1_RegSave+48]
!movdqu xmm7,[SHA1_RegSave+64]
!movdqu xmm8,[SHA1_RegSave+80]
!movdqu xmm9,[SHA1_RegSave+96]
!mov rsi,[SHA1_RegSave]
!mov rdi,[SHA1_RegSave+8]

For i = 0 To 16 Step 4
  Res_CPU$ + RSet(Hex(PeekL(pDest + i) & $FFFFFFFF), 8, "0")
Next

FreeMemory(Chunks64)
FreeMemory(pDest)

TE_CPU = ElapsedMilliseconds() - TA_CPU
;CPU_E CPU_E CPU_E CPU_E CPU_E CPU_E CPU_E CPU_E CPU_E CPU_E 

MessageRequester("String-SHA1-CPU-Test", "CPU : " + "Time CPU : " + Str(TE_CPU) + " ms" + #CRLF$ + Res_CPU$ + #CRLF$ + "PB : " + "Time PB : " + Str(TE_PB) + " ms" + #CRLF$ + UCase(Res_PB$)) 

End

DataSection
  !CONST_H0_H4 dd 10325476h,98BADCFEh,0EFCDAB89h,67452301h,0,0,0,0C3D2E1F0h    ;Start-Values H0-H4
  !PSHUFFLE_BYTE_FLIP_MASK dq 08090a0b0c0d0e0fh,0001020304050607h              ;like "bswap"
  !SHA1_RegSave:                                                               ;rsi, rdi, xmm4..xmm9
  !times 112 db 0
EndDataSection
User_Russian
Addict
Addict
Posts: 1441
Joined: Wed Nov 12, 2008 5:01 pm
Location: Russia

Re: SHA256 with CPU-SHA-Instruction-Set

Post by User_Russian »

Thank you.
User avatar
Mijikai
Addict
Addict
Posts: 1360
Joined: Sun Sep 11, 2016 2:17 pm

Re: SHA256 with CPU-SHA-Instruction-Set

Post by Mijikai »

Thank you :shock: pure magic 8)
I would suggest keeping it FASM-only (as an .obj file): cleaner, and usable with the C backend.
User_Russian
Addict
Addict
Posts: 1441
Joined: Wed Nov 12, 2008 5:01 pm
Location: Russia

Re: SHA256 with CPU-SHA-Instruction-Set

Post by User_Russian »

A module based on Helle's code.

Code: Select all

; https://www.purebasic.fr/english/viewtopic.php?t=69730

; Compile-time guards for the module below.
; 1) The target processor has to be x64.
CompilerIf #PB_Compiler_Processor <> #PB_Processor_x64
  CompilerError "Only x64 platform supported"
CompilerEndIf
; 2) The compiler has to be version 6.00 or newer AND use the assembler backend.
;    #PB_Compiler_Backend is only referenced inside the ">= 600" branch; older
;    compilers are rejected by the CompilerElse branch without evaluating it.
CompilerIf #PB_Compiler_Version>=600
  CompilerIf #PB_Compiler_Backend <> #PB_Backend_Asm
    CompilerError "Only assembler backend is supported"
  CompilerEndIf
CompilerElse
  CompilerError "FASM does not support SHA1 instructions"
CompilerEndIf


; Public interface of the SHA-1 module.
DeclareModule CPU_SHA1
  ; Result of the CPUID test for the SHA feature bit, as a Bool().
  Declare IsCPU_SHA()
  ; Allocates a hashing context preset with the start values; returns its address or 0.
  Declare CPU_StartFingerprint()
  ; Allocates a copy of the context *Sha; returns its address or 0.
  Declare CPU_CopyInstance(*Sha)
  ; Feeds Size bytes at *Buffer into the context; returns #False if an argument is 0 or Size <= 0.
  Declare CPU_AddFingerprintBuffer(*Sha, *Buffer, Size)
  ; Returns the digest of the data fed so far as a hex string; the context is not freed.
  Declare.s CPU_GetSHA(*Sha)
  ; Like CPU_GetSHA(), then frees the context.
  Declare.s CPU_FinishFingerprint(*Sha)
  ; One-shot digest of a memory area: start, add, finish.
  Declare.s CPU_Fingerprint(*Buffer, Size)
  ; One-shot digest of the UTF-8 form of a string (without its terminating zero byte).
  Declare.s CPU_StringFingerprint(s.s)
EndDeclareModule

Module CPU_SHA1
  EnableExplicit
  
  #SHA1_ChunksSize = 64
  
  ; Hashing context handed around as *Sha.
  Structure SHA_Data Align #PB_Structure_AlignC
    xmm0.M128A                     ; image of register xmm0 (preset with the first 16 bytes of the start values)
    xmm1.M128A                     ; image of register xmm1 (preset with the second 16 bytes of the start values)
    AllSize.q                      ; number of bytes already run through SHA1_Calc()
    Buff.a[#SHA1_ChunksSize]       ; bytes waiting for a complete 64-byte block
    BuffSize.a                     ; number of valid bytes in Buff
  EndStructure
  
  ; Loads the context into the registers SHA1_Calc() works on:
  ;   xmm0 <- *Sha\xmm0, xmm1 <- *Sha\xmm1, xmm7 <- byte order mask.
  ; Only rax is used as scratch register. xmm7 is overwritten on purpose and
  ; has to stay untouched until SHA1_Calc() has been called.
  Procedure SetReg(*Sha.SHA_Data)
    Protected xmm.M128A, *p
    
    ; each field is copied to a local first so that it can be addressed from assembly
    xmm = *Sha\xmm0
    !movdqu xmm0, dqword[p.v_xmm]
    xmm = *Sha\xmm1
    !movdqu xmm1, dqword[p.v_xmm]
    *p = ?PSHUFFLE_BYTE_FLIP_MASK
    !mov rax,[p.p_p]
    !movdqu xmm7,[rax]
    
    DataSection 
      PSHUFFLE_BYTE_FLIP_MASK:
      Data.q $08090a0b0c0d0e0f, $0001020304050607  ; like "bswap"
    EndDataSection
  EndProcedure
  
  ; Stores the registers left behind by SHA1_Calc() back into the context:
  ;   *Sha\xmm0 <- xmm0, *Sha\xmm1 <- xmm1.
  ; Only rax is used as scratch register.
  Procedure GetReg(*Sha.SHA_Data)
    Protected *xmm.M128A
    
    *xmm = @*Sha\xmm0
    !mov rax,[p.p_xmm]
    !movdqu [rax],xmm0
    
    *xmm = @*Sha\xmm1
    !mov rax,[p.p_xmm]
    !movdqu [rax],xmm1
  EndProcedure
  
  
  ; Runs the 80 SHA-1 rounds over CountChunks consecutive 64-byte blocks starting at *Buff.
  ;   *Buff       : address of the first block
  ;   CountChunks : number of blocks; the counter is decremented before it is tested,
  ;                 so it has to be at least 1
  ; Part of the interface is passed in registers instead of parameters:
  ;   in  : xmm0 / xmm1 as loaded by SetReg(), xmm7 = byte order mask from SetReg()
  ;   out : xmm0 / xmm1 updated, to be stored with GetReg() or read directly
  ; NOTE(review): rsi, rcx and xmm2..xmm9 are overwritten and not restored here -
  ; confirm that this is acceptable for the PureBasic code around the calls.
  Procedure SHA1_Calc(*Buff, CountChunks)
    !mov rsi,[p.p_Buff]
    !mov rcx,[p.v_CountChunks]

    ; one pass of this loop per 64-byte block
    !@@:
    ; keep the values from before this block, they are added again after round 79
    !movdqa xmm8,xmm1
    !movdqa xmm9,xmm0
    ;Rounds 0-3
    !movdqu xmm3,[rsi]
    !pshufb xmm3,xmm7
    !paddd xmm1,xmm3
    !movdqa xmm2,xmm0
    !sha1rnds4 xmm0,xmm1,0
    ;Rounds 4-7
    !movdqu xmm4,[rsi+16]
    !pshufb xmm4,xmm7
    !sha1nexte xmm2,xmm4
    !movdqa xmm1,xmm0
    !sha1rnds4 xmm0,xmm2,0
    !sha1msg1 xmm3,xmm4
    ;Rounds 8-11
    !movdqu xmm5,[rsi+32]
    !pshufb xmm5,xmm7
    !sha1nexte xmm1,xmm5
    !movdqa xmm2,xmm0
    !sha1rnds4 xmm0,xmm1,0
    !sha1msg1 xmm4,xmm5
    !pxor xmm3,xmm5
    ;Rounds 12-15
    !movdqu xmm6,[rsi+48]
    !pshufb xmm6,xmm7
    !sha1nexte xmm2,xmm6
    !movdqa xmm1,xmm0
    !sha1msg2 xmm3,xmm6
    !sha1rnds4 xmm0,xmm2,0
    !sha1msg1 xmm5,xmm6
    !pxor xmm4,xmm6
    ;Rounds 16-19
    !sha1nexte xmm1,xmm3
    !movdqa xmm2,xmm0
    !sha1msg2 xmm4,xmm3
    !sha1rnds4 xmm0,xmm1,0
    !sha1msg1 xmm6,xmm3
    !pxor xmm5,xmm3
    ;Rounds 20-23
    !sha1nexte xmm2,xmm4
    !movdqa xmm1,xmm0
    !sha1msg2 xmm5,xmm4
    !sha1rnds4 xmm0,xmm2,1
    !sha1msg1 xmm3,xmm4
    !pxor xmm6,xmm4
    ;Rounds 24-27
    !sha1nexte xmm1,xmm5
    !movdqa xmm2,xmm0
    !sha1msg2 xmm6,xmm5
    !sha1rnds4 xmm0,xmm1,1
    !sha1msg1 xmm4,xmm5
    !pxor xmm3,xmm5
    ;Rounds 28-31
    !sha1nexte xmm2,xmm6
    !movdqa xmm1,xmm0
    !sha1msg2 xmm3,xmm6
    !sha1rnds4 xmm0,xmm2,1
    !sha1msg1 xmm5,xmm6
    !pxor xmm4,xmm6
    ;Rounds 32-35
    !sha1nexte xmm1,xmm3
    !movdqa xmm2,xmm0
    !sha1msg2 xmm4,xmm3
    !sha1rnds4 xmm0,xmm1,1
    !sha1msg1 xmm6,xmm3
    !pxor xmm5,xmm3
    ;Rounds 36-39
    !sha1nexte xmm2,xmm4
    !movdqa xmm1,xmm0
    !sha1msg2 xmm5,xmm4
    !sha1rnds4 xmm0,xmm2,1
    !sha1msg1 xmm3,xmm4
    !pxor xmm6,xmm4
    ;Rounds 40-43
    !sha1nexte xmm1,xmm5
    !movdqa xmm2,xmm0
    !sha1msg2 xmm6,xmm5
    !sha1rnds4 xmm0,xmm1,2
    !sha1msg1 xmm4,xmm5
    !pxor xmm3,xmm5
    ;Rounds 44-47
    !sha1nexte xmm2,xmm6
    !movdqa xmm1,xmm0
    !sha1msg2 xmm3,xmm6
    !sha1rnds4 xmm0,xmm2,2
    !sha1msg1 xmm5,xmm6
    !pxor xmm4,xmm6
    ;Rounds 48-51
    !sha1nexte xmm1,xmm3
    !movdqa xmm2,xmm0
    !sha1msg2 xmm4,xmm3
    !sha1rnds4 xmm0,xmm1,2
    !sha1msg1 xmm6,xmm3
    !pxor xmm5,xmm3
    ;Rounds 52-55
    !sha1nexte xmm2,xmm4
    !movdqa xmm1,xmm0
    !sha1msg2 xmm5,xmm4
    !sha1rnds4 xmm0,xmm2,2
    !sha1msg1 xmm3,xmm4
    !pxor xmm6,xmm4
    ;Rounds 56-59
    !sha1nexte xmm1,xmm5
    !movdqa xmm2,xmm0
    !sha1msg2 xmm6,xmm5
    !sha1rnds4 xmm0,xmm1,2
    !sha1msg1 xmm4,xmm5
    !pxor xmm3,xmm5
    ;Rounds 60-63
    !sha1nexte xmm2,xmm6
    !movdqa xmm1,xmm0
    !sha1msg2 xmm3,xmm6
    !sha1rnds4 xmm0,xmm2,3
    !sha1msg1 xmm5,xmm6
    !pxor xmm4,xmm6
    ;Rounds 64-67
    !sha1nexte xmm1,xmm3
    !movdqa xmm2,xmm0
    !sha1msg2 xmm4,xmm3
    !sha1rnds4 xmm0,xmm1,3
    !sha1msg1 xmm6,xmm3
    !pxor xmm5,xmm3
    ;Rounds 68-71
    !sha1nexte xmm2,xmm4
    !movdqa xmm1,xmm0
    !sha1msg2 xmm5,xmm4
    !sha1rnds4 xmm0,xmm2,3
    !pxor xmm6,xmm4
    ;Rounds 72-75
    !sha1nexte xmm1,xmm5
    !movdqa xmm2,xmm0
    !sha1msg2 xmm6,xmm5
    !sha1rnds4 xmm0,xmm1,3
    ;Rounds 76-79
    !sha1nexte xmm2,xmm6
    !movdqa xmm1,xmm0
    !sha1rnds4 xmm0,xmm2,3
    
    ;Add current hash values with previously saved
    !sha1nexte xmm1,xmm8
    !paddd xmm0,xmm9
    
    ; advance to the next block
    !add rsi,64
    !dec rcx                     ;Chunks
    !jnz @b
  EndProcedure
  
  
  ; *****************
  ;- **** Public ****
  ; *****************
  
  ; Tests CPUID leaf 7, sub-leaf 0, EBX bit 29 (SHA).
  ; Returns the test as a Bool() value (non-zero = bit set).
  ; CPUID overwrites EBX, so RBX is parked in r8 and put back afterwards.
  ; Leaf 7 is only queried when leaf 0 reports that it exists; otherwise the result is 0.
  Procedure IsCPU_SHA()
    Protected x.l=0
    
    !mov r8,rbx
    !xor eax,eax
    !cpuid                       ; eax = highest basic leaf
    !cmp eax,7
    !jb @f
    !mov eax,7
    !xor ecx,ecx
    !cpuid
    !mov dword [p.v_x], ebx
    !@@:
    !mov rbx,r8
    
    ProcedureReturn Bool(x & $20000000)
  EndProcedure
  
  ; Creates a new hashing context.
  ; Returns the address of the context, or 0 if it could not be allocated.
  Procedure CPU_StartFingerprint()
    Protected *Ctx.SHA_Data = 0
    
    *Ctx = AllocateStructure(SHA_Data)
    If *Ctx
      ; nothing buffered, nothing hashed yet
      *Ctx\BuffSize = 0
      *Ctx\AllSize = 0
      ; the 32 bytes of start values are split over the two register images
      CopyMemory(?CONST_H0_H4, @*Ctx\xmm0, 16)
      CopyMemory(?CONST_H0_H4 + 16, @*Ctx\xmm1, 16)
    EndIf
    
    ProcedureReturn *Ctx
    
    DataSection
      CONST_H0_H4:
      Data.l $10325476, $98BADCFE, $0EFCDAB89, $67452301, 0, 0, 0, $0C3D2E1F0  ; Start-Values H0-H4
    EndDataSection
  EndProcedure
  
  ; Duplicates a hashing context.
  ; Returns the address of the copy, or 0 if *Sha is 0 or the allocation failed.
  Procedure CPU_CopyInstance(*Sha.SHA_Data)
    Protected *Clone = 0
    
    If *Sha = 0
      ProcedureReturn 0
    EndIf
    
    *Clone = AllocateStructure(SHA_Data)
    If *Clone
      CopyStructure(*Sha, *Clone, SHA_Data)
    EndIf
    
    ProcedureReturn *Clone
  EndProcedure
  
  ; Feeds Size bytes starting at *Buffer into the context *Sha.
  ; Complete 64-byte blocks are hashed at once, the remainder waits in \Buff.
  ; Returns #True if the data was taken, #False if *Sha or *Buffer is 0 or Size <= 0.
  ; After the call \BuffSize is always below 64: a pending buffer that becomes
  ; exactly full is hashed immediately instead of being left for later.
  Procedure CPU_AddFingerprintBuffer(*Sha.SHA_Data, *Buffer, Size)
    Protected r = #False, x, BuffPos=0
    
    If *Sha And *Buffer And Size>0
      r = #True
      With *Sha
        ; 1) top up bytes left over from an earlier call
        If \BuffSize>0
          If \BuffSize+Size<#SHA1_ChunksSize
            ; still no complete block: only buffer the new bytes
            CopyMemory(*Buffer, @\Buff[\BuffSize], Size)
            \BuffSize+Size
            Size=0
          Else
            ; complete the pending block (x can be 0 if the buffer was already full) and hash it
            x=#SHA1_ChunksSize-\BuffSize
            CopyMemory(*Buffer, @\Buff[\BuffSize], x)
            SetReg(*Sha)
            SHA1_Calc(@\Buff, 1)
            GetReg(*Sha)
            \AllSize + #SHA1_ChunksSize
            \BuffSize=0
            BuffPos = x
            Size - x
          EndIf
        EndIf
        
        If Size>0
          ; 2) hash all complete blocks directly from the caller's memory
          x = Size / #SHA1_ChunksSize
          If x > 0
            SetReg(*Sha)
            SHA1_Calc(*Buffer + BuffPos, x)
            GetReg(*Sha)
            \AllSize + x * #SHA1_ChunksSize
          EndIf
          
          ; 3) keep the incomplete rest for the next call
          x = Size % #SHA1_ChunksSize
          If x>0
            CopyMemory(*Buffer + BuffPos + (Size-x), @\Buff[\BuffSize], x)
            \BuffSize+x
          EndIf
        EndIf
      EndWith
      
    EndIf
    
    ProcedureReturn r
  EndProcedure
  
  ; Returns the SHA-1 digest of everything fed into *Sha so far as 40 hex digits.
  ; The context itself is not changed and not freed, so more data can be added later.
  ; Returns an empty string if *Sha is 0, if memory is short, or if \BuffSize is
  ; larger than a block (the debugger is called in that case).
  ;
  ; The padding is built in a zero-filled scratch area of two blocks:
  ;   pending bytes, $80, zero bytes, message length in bits (64 bit, big endian).
  ; Fewer than 56 pending bytes fit into one block together with the length, 56 to 64 need two.
  Procedure.s CPU_GetSHA(*Sha.SHA_Data)
    Protected r.s, *p=0, i, Tail, Blocks
    
    If *Sha
      Tail = *Sha\BuffSize
      If Tail <= #SHA1_ChunksSize
        If Tail < 56
          Blocks = 1
        Else
          Blocks = 2
        EndIf
        
        *p = AllocateMemory(2 * #SHA1_ChunksSize)
        If *p
          If Tail > 0
            CopyMemory(@*Sha\Buff, *p, Tail)
          EndIf
          PokeA(*p + Tail, $80)
          
          i = *Sha\AllSize + Tail
          !mov rax,[p.v_i]
          !shl rax,3
          !bswap rax
          !mov [p.v_i],rax  ;Bits to encoded
          PokeQ(*p + Blocks * #SHA1_ChunksSize - 8, i)
          
          ; no library calls between SetReg() and the register read-out below
          SetReg(*Sha)
          SHA1_Calc(*p, Blocks)
          
          ; the scratch area is reused for the 20 bytes of the digest
          !mov rax,[p.p_p]
          !pshufd xmm0,xmm0,1bh
          !movdqu [rax],xmm0
          !pextrd [rax+16],xmm1,3
          
          For i = 0 To 16 Step 4
            r + RSet(Hex(PeekL(*p + i) & $FFFFFFFF), 8, "0")
          Next
          
          FreeMemory(*p)
        EndIf
      Else
        CallDebugger
      EndIf
    EndIf
    ProcedureReturn r
  EndProcedure
  
  ; Returns the digest for *Sha and frees the context.
  ; An empty string is returned if *Sha is 0.
  Procedure.s CPU_FinishFingerprint(*Sha.SHA_Data)
    Protected Digest.s
    
    If *Sha = 0
      ProcedureReturn Digest
    EndIf
    
    Digest = CPU_GetSHA(*Sha)
    FreeStructure(*Sha)
    
    ProcedureReturn Digest
  EndProcedure
  
  ; Digest of Size bytes at *Buffer in one call: start, add, finish.
  ; An empty string is returned if no context could be created.
  Procedure.s CPU_Fingerprint(*Buffer, Size)
    Protected Digest.s, *Ctx
    
    *Ctx = CPU_StartFingerprint()
    If *Ctx = 0
      ProcedureReturn Digest
    EndIf
    
    CPU_AddFingerprintBuffer(*Ctx, *Buffer, Size)
    Digest = CPU_FinishFingerprint(*Ctx)
    
    ProcedureReturn Digest
  EndProcedure
  
  
  ; Digest of the UTF-8 form of s. The terminating zero byte of the UTF-8
  ; buffer is not hashed (hence MemorySize() - 1).
  ; An empty string is returned if the UTF-8 buffer could not be created.
  Procedure.s CPU_StringFingerprint(s.s)
    Protected Digest.s, *Utf8
    
    *Utf8 = UTF8(s)
    If *Utf8 = 0
      ProcedureReturn Digest
    EndIf
    
    Digest = CPU_Fingerprint(*Utf8, MemorySize(*Utf8)-1)
    FreeMemory(*Utf8)
    
    ProcedureReturn Digest
  EndProcedure
EndModule
Examples

Code: Select all

UseModule CPU_SHA1

; Report whether the CPU offers the SHA instructions
If IsCPU_SHA() = #False
  Debug "SHA instructions are NOT supported"
Else
  Debug "SHA instructions supported"
EndIf

Code: Select all

UseSHA1Fingerprint()

UseModule CPU_SHA1

; The module executes SHA opcodes, so stop here on CPUs without them
If IsCPU_SHA() = #False
  Debug "SHA instructions are NOT supported"
  End
EndIf

; Each pair of lines must be identical: module result first, PureBasic's own
; result second (upper-cased, because the module produces upper-case digits)
s.s="В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!"
Debug CPU_StringFingerprint(s)
Debug UCase(StringFingerprint(s, #PB_Cipher_SHA1))

Debug "----"

s="The quick brown fox jumps over the lazy dog."
Debug CPU_StringFingerprint(s)
Debug UCase(StringFingerprint(s, #PB_Cipher_SHA1))

Code: Select all

UseSHA1Fingerprint()

UseModule CPU_SHA1

; Benchmark: the same random buffer is hashed Passes times, once with
; PureBasic's fingerprint library and once with the module.
Define sPB.s, sCPU.s, tPB, tCPU, i

Passes = 100
Bytes = 10*1024*1024
*Payload = AllocateMemory(Bytes)
If *Payload
  RandomData(*Payload, Bytes)
  
  ; PureBasic library
  tPB = ElapsedMilliseconds()
  If StartFingerprint(0, #PB_Cipher_SHA1)
    For i = 1 To Passes
      AddFingerprintBuffer(0, *Payload, Bytes)
    Next
    sPB = UCase(FinishFingerprint(0))
  EndIf
  tPB = ElapsedMilliseconds() - tPB
  
  ; CPU instructions
  tCPU = ElapsedMilliseconds()
  *Ctx = CPU_StartFingerprint()
  If *Ctx
    For i = 1 To Passes
      CPU_AddFingerprintBuffer(*Ctx, *Payload, Bytes)
    Next
    sCPU = CPU_FinishFingerprint(*Ctx)
  EndIf
  tCPU = ElapsedMilliseconds() - tCPU
  
  MessageRequester("",~"PB \nTime = "+tPB+~" ms\n"+sPB+~"\n\nCPU\nTime = "+tCPU+~" ms\n"+sCPU)
  
  FreeMemory(*Payload)
EndIf
Post Reply