Netmaestro, I realized the plain asm approach you used in your other SHA routines that are 32 bit might be faster after all for 64 bit compared to SSE2.
The speed increase that was visible was in the block tests that handle two quads at a time but that isn't the case anymore.
Here's my plain ASM approach with sigma macros and the suggestion Thorium did earlier to use some extra SSE2 registers for temporary storage.
Code: Select all
;====================================================================================
;
; Library Commands: sha512FingerPrint()
; sha512FileFingerPrint()
; sha384FingerPrint()
; sha384FileFingerPrint()
;
; Author: Lloyd Gallant (netmaestro)
;
; Contributors: Thanks to wilbert, Danilo, thorium, infratec and idle
; for their help with the asm routines
; and to Christopher Devine for the
; c code this program is based on.
;
; Date: August 15, 2011
; Target Compiler: Purebasic 4 and up
; Target OS: Windows, Linux, MacOS
;
; License: GNU General Public License
;
; This program is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License As published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY Or FITNESS For A PARTICULAR PURPOSE. See the
; GNU General Public License For more details.
;
; The logic for this program is based on sha256.c found here:
;
; http://www.spale.com/download/scrypt/scrypt1.0/
;
; You can test the accuracy of this program by comparing results with
; test data at:
;
; http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/SHA2_Additional.pdf
;
;===================================================================================
;
;
; Usage:
;
; result$ = sha512Fingerprint(*address, length [, *progress])
; result$ = sha512FileFingerprint(file$ [, *progress])
;
; result$ = sha384Fingerprint(*address, length [, *progress])
; result$ = sha384FileFingerprint(file$ [, *progress])
;
; Progress callback function:
;
; Procedure MyCallBack(value.i)
; ; value is 0 to 100 representing percentage completed
; Endprocedure
;
;
;================================================================
; STRUCTURES
;================================================================
; Running hash state shared by the starts / update / finish procedures.
Structure sha512_context
; count of message bytes handed to shaQuad_update so far
total.q
; the eight 64-bit chaining values, seeded by sha384_starts or sha512_starts
state.q [8]
; holding area for an incomplete 128-byte block between shaQuad_update calls
buffer.a [128]
EndStructure
; 128 raw bytes. shaQuad_finish declares one of these as its padding block
; and sets the first byte to $80.
Structure UINT8_BUFFER
b.a[128]
EndStructure
; Array of 80 quads. sha512_process uses it for its 80-entry work array W,
; and the same type serves as a quad-indexed view over the input block and
; over the digest output buffer (only the leading entries are touched there).
Structure UINT64_BUFFER
w.q[80]
EndStructure
; 16-byte length trailer appended to the message by shaQuad_finish.
; NOTE: the field names describe memory position, not numeric significance:
; lowpart occupies bytes 0-7 and highpart bytes 8-15. shaQuad_finish writes the
; byte-swapped bit count into highpart only and leaves lowpart untouched.
Structure msglen
lowpart.q
highpart.q
EndStructure
;================================================================
; HELPER MACROS
;================================================================
; Bitwise majority: a result bit is 1 when at least two of the corresponding
; bits of x, y and z are 1. P() applies it to the a, b, c working variables.
Macro F0(x,y,z)
((x & y) | (z & (x | y)))
EndMacro
; Bitwise select: where a bit of x is 1 the result takes the bit from y,
; where it is 0 the result takes the bit from z.
; P() applies it to the e, f, g working variables.
Macro F1(x,y,z)
(z ! (x & (y ! z)))
EndMacro
; SIGMA01: applies the two message-schedule functions in place to the global
; scratch quads, using only 32-bit registers:
;   SigmaTemp02 <- ROTR(v,1)  XOR ROTR(v,8)  XOR SHR(v,7)   (v = old SigmaTemp02)
;   SigmaTemp13 <- ROTR(v,19) XOR ROTR(v,61) XOR SHR(v,6)   (v = old SigmaTemp13)
; Each quad is handled as two dwords (low at +0, high at +4); a 64-bit rotate or
; shift is built from one shrd/shld per result dword.
; Overwrites eax, ecx, edx, xmm0, xmm1 and the flags.
; ebx is parked in xmm0 on entry and restored on exit.
Macro SIGMA01() ; By wilbert
; park ebx in xmm0 so it can serve as a scratch register
!movd xmm0, ebx
; ---- SigmaTemp02, low result dword (eax/ebx/ecx = low half, edx = high half) ----
!mov eax,dword [v_SigmaTemp02]
!mov ebx, eax
!mov ecx, eax
; keep a copy of the original low half for the high-dword pass
!movd xmm1, eax
!mov edx,dword [v_SigmaTemp02 + 4]
; rotate right 1
!shrd eax, edx, 1
; rotate right 8
!shrd ebx, edx, 8
; shift right 7
!shrd ecx, edx, 7
!xor eax, ebx
!xor eax, ecx
!mov dword [v_SigmaTemp02], eax
; ---- SigmaTemp02, high result dword (edx/ebx/ecx = high half, eax = original low half) ----
!movd eax, xmm1
!mov ebx, edx
!mov ecx, edx
; high halves of rotate right 1 and rotate right 8 pull bits in from the low half
!shrd edx, eax, 1
!shrd ebx, eax, 8
; high half of a plain shift right 7 is zero-filled, so a simple shr is enough
!shr ecx, 7
!xor edx, ebx
!xor edx, ecx
!mov dword [v_SigmaTemp02 + 4], edx
; ---- SigmaTemp13, low result dword ----
!mov eax,dword [v_SigmaTemp13]
!mov ebx, eax
!mov ecx, eax
; keep a copy of the original low half for the high-dword pass
!movd xmm1, eax
!mov edx,dword [v_SigmaTemp13 + 4]
; rotate right 19
!shrd eax, edx, 19
; rotate right 61 = left 3
!shld ebx, edx, 3
; shift right 6
!shrd ecx, edx, 6
!xor eax, ebx
!xor eax, ecx
!mov dword [v_SigmaTemp13], eax
; ---- SigmaTemp13, high result dword ----
!movd eax, xmm1
!mov ebx, edx
!mov ecx, edx
!shrd edx, eax, 19
!shld ebx, eax, 3
; high half of a plain shift right 6 is zero-filled
!shr ecx, 6
!xor edx, ebx
!xor edx, ecx
!mov dword [v_SigmaTemp13 + 4], edx
; restore the caller's ebx
!movd ebx, xmm0
EndMacro
; Scratch quads through which the R() and P() macros hand operands to, and read
; results back from, the SIGMA01 / SIGMA23 asm macros (referenced there as
; v_SigmaTemp02 and v_SigmaTemp13).
; NOTE(review): being process-wide globals, these presumably make the hash
; routines unsafe to run from several threads at once - confirm before threading.
Global.q SigmaTemp02, SigmaTemp13
; SIGMA23: applies the two round functions in place to the global scratch quads,
; using only 32-bit registers:
;   SigmaTemp02 <- ROTR(v,28) XOR ROTR(v,34) XOR ROTR(v,39)   (v = old SigmaTemp02)
;   SigmaTemp13 <- ROTR(v,14) XOR ROTR(v,18) XOR ROTR(v,41)   (v = old SigmaTemp13)
; Each quad is handled as two dwords (low at +0, high at +4). Rotations by more
; than 32 are expressed as the complementary left rotation with shld.
; Overwrites eax, ecx, edx, xmm0, xmm1 and the flags.
; ebx is parked in xmm0 on entry and restored on exit.
Macro SIGMA23() ; By wilbert
; park ebx in xmm0 so it can serve as a scratch register
!movd xmm0, ebx
; ---- SigmaTemp02, low result dword (eax/ebx/ecx = low half, edx = high half) ----
!mov eax,dword [v_SigmaTemp02]
!mov ebx, eax
!mov ecx, eax
; keep a copy of the original low half for the high-dword pass
!movd xmm1, eax
!mov edx,dword [v_SigmaTemp02 + 4]
; rotate right 28
!shrd eax, edx, 28
; rotate right 34 = left 30
!shld ebx, edx, 30
; rotate right 39 = left 25
!shld ecx, edx, 25
!xor eax, ebx
!xor eax, ecx
!mov dword [v_SigmaTemp02], eax
; ---- SigmaTemp02, high result dword (edx/ebx/ecx = high half, eax = original low half) ----
!movd eax, xmm1
!mov ebx, edx
!mov ecx, edx
!shrd edx, eax, 28
!shld ebx, eax, 30
!shld ecx, eax, 25
!xor edx, ebx
!xor edx, ecx
!mov dword [v_SigmaTemp02 + 4], edx
; ---- SigmaTemp13, low result dword ----
!mov eax,dword [v_SigmaTemp13]
!mov ebx, eax
!mov ecx, eax
; keep a copy of the original low half for the high-dword pass
!movd xmm1, eax
!mov edx,dword [v_SigmaTemp13 + 4]
; rotate right 14
!shrd eax, edx, 14
; rotate right 18
!shrd ebx, edx, 18
; rotate right 41 = left 23
!shld ecx, edx, 23
!xor eax, ebx
!xor eax, ecx
!mov dword [v_SigmaTemp13], eax
; ---- SigmaTemp13, high result dword ----
!movd eax, xmm1
!mov ebx, edx
!mov ecx, edx
!shrd edx, eax, 14
!shrd ebx, eax, 18
!shld ecx, eax, 23
!xor edx, ebx
!xor edx, ecx
!mov dword [v_SigmaTemp13 + 4], edx
; restore the caller's ebx
!movd ebx, xmm0
EndMacro
; Returns, in eax, the address of the raw byte-swap routine that starts at the
; ChgEnd64_start label. The instructions after ProcedureReturn are not executed
; by a call to this procedure itself; they are only reached through that address.
;
; The raw routine takes one 64-bit argument on the stack ([esp + 4] = low dword,
; [esp + 8] = high dword), byte-swaps each dword and returns them exchanged in
; edx:eax, i.e. the quad with its eight bytes in reverse order. It ends with a
; plain ret and leaves the argument on the stack for the caller to remove.
Procedure.q ChgEnd64Addr__() ; By idle
; small adaptation by Wilbert to
; work around a Purebasic OS X bug
!mov eax,ChgEnd64_start
ProcedureReturn
!ChgEnd64_start:
; eax <- high dword of the argument, edx <- low dword
!mov eax, [esp + 8]
!mov edx, [esp + 4]
; reversing the bytes of each half and swapping the halves reverses the whole quad
!bswap eax
!bswap edx
!ret
EndProcedure
; C-calling-convention prototype matching the raw routine inside ChgEnd64Addr__:
; takes a quad, returns a quad.
PrototypeC.q ProtoChgEnd64(value.q)
; ChangeEndian64(value) is bound once, at program start, to the address returned
; by ChgEnd64Addr__() and is then called like an ordinary function.
Global ChangeEndian64.ProtoChgEnd64 = ChgEnd64Addr__()
; Message-schedule expansion step: fills w\w[t] from four earlier entries.
; Expects a UINT64_BUFFER named w in the expanding scope and uses the global
; scratch quads SigmaTemp02 / SigmaTemp13 as the operands/results of SIGMA01.
Macro R(t)
  ; load the two words that need a sigma transform
  SigmaTemp13 = w\w[t - 2]
  SigmaTemp02 = w\w[t - 15]
  ; transforms both scratch quads in place
  SIGMA01()
  ; 64-bit wrap-around sum of the two transformed and two plain words
  w\w[t] = w\w[t - 16] + SigmaTemp02 + w\w[t - 7] + SigmaTemp13
EndMacro
; One compression round. a..h are the eight working variables in the order they
; play for this round, x is the schedule word and K the round constant.
; Only d and h are modified. Expects quads temp1 and temp2 in the expanding
; scope and uses the global scratch quads as operands/results of SIGMA23.
Macro P(a,b,c,d,e,f,g,h,x,K)
  ; hand e and a to the asm macro; both come back transformed in place
  SigmaTemp13 = e
  SigmaTemp02 = a
  SIGMA23()
  temp2 = F0(a,b,c) + SigmaTemp02
  temp1 = h + x + K + SigmaTemp13 + F1(e,f,g)
  ; h and d are distinct variables, so the order of these two updates is free
  h = temp2 + temp1
  d = d + temp1
EndMacro
;================================================================
; LOCAL FUNCTIONS
;================================================================
; Prepares *ctx for a new SHA-384 computation.
;   *ctx : context to initialise
; Resets the byte counter and loads the eight SHA-384 initial hash values.
Procedure sha384_starts(*ctx.sha512_context )
  ; shaQuad_update derives the buffer fill level from total, so it must start
  ; at zero even when the context memory is reused or was not zero-filled
  *ctx\total = 0
  *ctx\state[0] = $cbbb9d5dc1059ed8
  *ctx\state[1] = $629a292a367cd507
  *ctx\state[2] = $9159015a3070dd17
  *ctx\state[3] = $152fecd8f70e5939
  *ctx\state[4] = $67332667ffc00b31
  *ctx\state[5] = $8eb44a8768581511
  *ctx\state[6] = $db0c2e0d64f98fa7
  *ctx\state[7] = $47b5481dbefa4fa4
EndProcedure
; Prepares *ctx for a new SHA-512 computation.
;   *ctx : context to initialise
; Resets the byte counter and loads the eight SHA-512 initial hash values.
Procedure sha512_starts( *ctx.sha512_context )
  ; shaQuad_update derives the buffer fill level from total, so it must start
  ; at zero even when the context memory is reused or was not zero-filled
  *ctx\total = 0
  *ctx\state[0] = $6a09e667f3bcc908
  *ctx\state[1] = $bb67ae8584caa73b
  *ctx\state[2] = $3c6ef372fe94f82b
  *ctx\state[3] = $a54ff53a5f1d36f1
  *ctx\state[4] = $510e527fade682d1
  *ctx\state[5] = $9b05688c2b3e6c1f
  *ctx\state[6] = $1f83d9abfb41bd6b
  *ctx\state[7] = $5be0cd19137e2179
EndProcedure
; Compresses one 128-byte message block into the chaining state.
;   *ctx   : context whose state[0..7] is read and then updated
;   *bytes : the block, read as 16 quads
; Steps: byte-swap the 16 input quads into W[0..15], expand W[16..79] with R(),
; run 80 rounds of P() on working copies A..H, then add A..H back into the state.
Procedure sha512_process( *ctx.sha512_context, *bytes.UINT64_BUFFER)
; 80-entry work array; the R() macro refers to it by the name w
W.UINT64_BUFFER
Define.q A, B, C, D, E, F, G, H
; scratch quads required by the P() macro
Define.q temp1, temp2
; load the block, reversing the byte order of every quad
For i=0 To 15
w\w[i] = ChangeEndian64( *bytes\w[i] )
Next
; working copies of the chaining values
A = *ctx\state[0]
B = *ctx\state[1]
C = *ctx\state[2]
D = *ctx\state[3]
E = *ctx\state[4]
F = *ctx\state[5]
G = *ctx\state[6]
H = *ctx\state[7]
; expand the remaining 64 schedule words before any round is run
For ii=16 To 79
R(ii)
Next
; 80 unrolled rounds. Instead of shuffling the eight variables after each round,
; the argument list is rotated one position per line; after every eight lines
; the variables are back in their original roles. Each round takes the next
; schedule word and its own round constant.
P( A, B, C, D, E, F, G, H, w\w[ 0], $428A2F98D728AE22 )
P( H, A, B, C, D, E, F, G, w\w[ 1], $7137449123EF65CD )
P( G, H, A, B, C, D, E, F, w\w[ 2], $B5C0FBCFEC4D3B2F )
P( F, G, H, A, B, C, D, E, w\w[ 3], $E9B5DBA58189DBBC )
P( E, F, G, H, A, B, C, D, w\w[ 4], $3956C25BF348B538 )
P( D, E, F, G, H, A, B, C, w\w[ 5], $59F111F1B605D019 )
P( C, D, E, F, G, H, A, B, w\w[ 6], $923F82A4AF194F9B )
P( B, C, D, E, F, G, H, A, w\w[ 7], $AB1C5ED5DA6D8118 )
P( A, B, C, D, E, F, G, H, w\w[ 8], $D807AA98A3030242 )
P( H, A, B, C, D, E, F, G, w\w[ 9], $12835B0145706FBE )
P( G, H, A, B, C, D, E, F, w\w[10], $243185BE4EE4B28C )
P( F, G, H, A, B, C, D, E, w\w[11], $550C7DC3D5FFB4E2 )
P( E, F, G, H, A, B, C, D, w\w[12], $72BE5D74F27B896F )
P( D, E, F, G, H, A, B, C, w\w[13], $80DEB1FE3B1696B1 )
P( C, D, E, F, G, H, A, B, w\w[14], $9BDC06A725C71235 )
P( B, C, D, E, F, G, H, A, w\w[15], $C19BF174CF692694 )
P( A, B, C, D, E, F, G, H, w\w[16], $E49B69C19EF14AD2 )
P( H, A, B, C, D, E, F, G, w\w[17], $EFBE4786384F25E3 )
P( G, H, A, B, C, D, E, F, w\w[18], $0FC19DC68B8CD5B5 )
P( F, G, H, A, B, C, D, E, w\w[19], $240CA1CC77AC9C65 )
P( E, F, G, H, A, B, C, D, w\w[20], $2DE92C6F592B0275 )
P( D, E, F, G, H, A, B, C, w\w[21], $4A7484AA6EA6E483 )
P( C, D, E, F, G, H, A, B, w\w[22], $5CB0A9DCBD41FBD4 )
P( B, C, D, E, F, G, H, A, w\w[23], $76F988DA831153B5 )
P( A, B, C, D, E, F, G, H, w\w[24], $983E5152EE66DFAB )
P( H, A, B, C, D, E, F, G, w\w[25], $A831C66D2DB43210 )
P( G, H, A, B, C, D, E, F, w\w[26], $B00327C898FB213F )
P( F, G, H, A, B, C, D, E, w\w[27], $BF597FC7BEEF0EE4 )
P( E, F, G, H, A, B, C, D, w\w[28], $C6E00BF33DA88FC2 )
P( D, E, F, G, H, A, B, C, w\w[29], $D5A79147930AA725 )
P( C, D, E, F, G, H, A, B, w\w[30], $06CA6351E003826F )
P( B, C, D, E, F, G, H, A, w\w[31], $142929670A0E6E70 )
P( A, B, C, D, E, F, G, H, w\w[32], $27B70A8546D22FFC )
P( H, A, B, C, D, E, F, G, w\w[33], $2E1B21385C26C926 )
P( G, H, A, B, C, D, E, F, w\w[34], $4D2C6DFC5AC42AED )
P( F, G, H, A, B, C, D, E, w\w[35], $53380D139D95B3DF )
P( E, F, G, H, A, B, C, D, w\w[36], $650A73548BAF63DE )
P( D, E, F, G, H, A, B, C, w\w[37], $766A0ABB3C77B2A8 )
P( C, D, E, F, G, H, A, B, w\w[38], $81C2C92E47EDAEE6 )
P( B, C, D, E, F, G, H, A, w\w[39], $92722C851482353B )
P( A, B, C, D, E, F, G, H, w\w[40], $A2BFE8A14CF10364 )
P( H, A, B, C, D, E, F, G, w\w[41], $A81A664BBC423001 )
P( G, H, A, B, C, D, E, F, w\w[42], $C24B8B70D0F89791 )
P( F, G, H, A, B, C, D, E, w\w[43], $C76C51A30654BE30 )
P( E, F, G, H, A, B, C, D, w\w[44], $D192E819D6EF5218 )
P( D, E, F, G, H, A, B, C, w\w[45], $D69906245565A910 )
P( C, D, E, F, G, H, A, B, w\w[46], $F40E35855771202A )
P( B, C, D, E, F, G, H, A, w\w[47], $106AA07032BBD1B8 )
P( A, B, C, D, E, F, G, H, w\w[48], $19A4C116B8D2D0C8 )
P( H, A, B, C, D, E, F, G, w\w[49], $1E376C085141AB53 )
P( G, H, A, B, C, D, E, F, w\w[50], $2748774CDF8EEB99 )
P( F, G, H, A, B, C, D, E, w\w[51], $34B0BCB5E19B48A8 )
P( E, F, G, H, A, B, C, D, w\w[52], $391C0CB3C5C95A63 )
P( D, E, F, G, H, A, B, C, w\w[53], $4ED8AA4AE3418ACB )
P( C, D, E, F, G, H, A, B, w\w[54], $5B9CCA4F7763E373 )
P( B, C, D, E, F, G, H, A, w\w[55], $682E6FF3D6B2B8A3 )
P( A, B, C, D, E, F, G, H, w\w[56], $748F82EE5DEFB2FC )
P( H, A, B, C, D, E, F, G, w\w[57], $78A5636F43172F60 )
P( G, H, A, B, C, D, E, F, w\w[58], $84C87814A1F0AB72 )
P( F, G, H, A, B, C, D, E, w\w[59], $8CC702081A6439EC )
P( E, F, G, H, A, B, C, D, w\w[60], $90BEFFFA23631E28 )
P( D, E, F, G, H, A, B, C, w\w[61], $A4506CEBDE82BDE9 )
P( C, D, E, F, G, H, A, B, w\w[62], $BEF9A3F7B2C67915 )
P( B, C, D, E, F, G, H, A, w\w[63], $C67178F2E372532B )
P( A, B, C, D, E, F, G, H, w\w[64], $CA273ECEEA26619C )
P( H, A, B, C, D, E, F, G, w\w[65], $D186B8C721C0C207 )
P( G, H, A, B, C, D, E, F, w\w[66], $EADA7DD6CDE0EB1E )
P( F, G, H, A, B, C, D, E, w\w[67], $F57D4F7FEE6ED178 )
P( E, F, G, H, A, B, C, D, w\w[68], $06F067AA72176FBA )
P( D, E, F, G, H, A, B, C, w\w[69], $0A637DC5A2C898A6 )
P( C, D, E, F, G, H, A, B, w\w[70], $113F9804BEF90DAE )
P( B, C, D, E, F, G, H, A, w\w[71], $1B710B35131C471B )
P( A, B, C, D, E, F, G, H, w\w[72], $28DB77F523047D84 )
P( H, A, B, C, D, E, F, G, w\w[73], $32CAAB7B40C72493 )
P( G, H, A, B, C, D, E, F, w\w[74], $3C9EBE0A15C9BEBC )
P( F, G, H, A, B, C, D, E, w\w[75], $431D67C49C100D4C )
P( E, F, G, H, A, B, C, D, w\w[76], $4CC5D4BECB3E42B6 )
P( D, E, F, G, H, A, B, C, w\w[77], $597F299CFC657E2A )
P( C, D, E, F, G, H, A, B, w\w[78], $5FCB6FAB3AD6FAEC )
P( B, C, D, E, F, G, H, A, w\w[79], $6C44198C4A475817 )
; fold the round output back into the chaining state (64-bit wrap-around adds)
*ctx\state[0] + A
*ctx\state[1] + B
*ctx\state[2] + C
*ctx\state[3] + D
*ctx\state[4] + E
*ctx\state[5] + F
*ctx\state[6] + G
*ctx\state[7] + H
EndProcedure
; Feeds `length` bytes starting at *input into the hash.
;   *ctx      : running context (total, state and partial-block buffer are updated)
;   *input    : address of the data
;   length    : number of bytes at *input; zero or negative is ignored
;   jobsize   : size of the whole job in bytes, used only to scale the progress value
;   *callback : optional procedure taking one integer (0..100); 0 = no reporting
;
; Complete 128-byte blocks are compressed immediately; a trailing partial block
; is kept in *ctx\buffer until later calls complete it.
;
; The progress value is derived from the context's own byte counter rather than
; from a procedure-static accumulator, so every hash computation starts again at
; zero and the reported value cannot run past 100.
Procedure shaQuad_update( *ctx.sha512_context, *input, length, jobsize.q, *callback )
  Protected.l left, fill
  Protected processed.q, percent.q
  Protected progress

  If length <= 0 : ProcedureReturn : EndIf

  left = *ctx\total & 127            ; bytes already waiting in the buffer
  fill = 128 - left                  ; bytes needed to complete that block
  processed = *ctx\total - left      ; bytes compressed so far (whole blocks only)
  *ctx\total + length

  ; top up and flush a previously buffered partial block first
  If left And (length >= fill)
    CopyMemory( *input, @*ctx\buffer[0]+left, fill )
    sha512_process( *ctx, @*ctx\buffer[0] )
    length - fill
    *input + fill
    left = 0
    processed + 128
  EndIf

  ; compress whole blocks straight from the caller's memory
  While length >= 128
    sha512_process( *ctx, *input )
    length - 128
    *input + 128
    processed + 128
    If *callback And jobsize > 0
      ; evaluate in a quad so large jobs do not overflow a 32-bit integer
      percent = processed * 100 / jobsize
      If percent > 100 : percent = 100 : EndIf
      progress = percent
      CallFunctionFast(*callback, progress )
    EndIf
  Wend

  ; keep the tail for the next call
  If length
    CopyMemory( *input, @*ctx\buffer[0]+left, length )
  EndIf
EndProcedure
; Appends the final padding and length trailer and writes the digest.
;   *ctx      : running context; it is consumed by this call
;   *digest   : receives 48 bytes (full = 0) or 64 bytes (full <> 0)
;   jobsize   : kept for interface compatibility; the length trailer is taken from
;               the context's own byte counter so that it always matches the data
;               that was actually hashed
;   full      : non-zero for SHA-512 output, zero for SHA-384 output
;   *callback : passed through to shaQuad_update
Procedure shaQuad_finish( *ctx.sha512_context, *digest.UINT64_BUFFER, jobsize.q, full, *callback )
  Protected.l last, padn
  Protected i, lastword
  Protected msglen.msglen
  Protected sha512_padding.UINT8_BUFFER

  ; padding is a single $80 byte followed by zero bytes
  sha512_padding\b[0] = $80

  ; Capture the message length in bits before the padding changes the counter.
  ; The trailer is 16 bytes, most significant byte first: bytes 0-7 (lowpart)
  ; stay zero, bytes 8-15 (highpart) carry the byte-swapped bit count.
  msglen\lowpart = 0
  msglen\highpart = ChangeEndian64(*ctx\total << 3)

  ; pad so that exactly 16 bytes remain free in the last 128-byte block
  last = *ctx\total & 127
  If last < 112
    padn = 112 - last
  Else
    padn = 240 - last
  EndIf

  shaQuad_update( *ctx, @sha512_padding, padn, padn, *callback )
  shaQuad_update( *ctx, @msglen, 16, 16, *callback )

  ; SHA-384 emits the first six state words, SHA-512 all eight
  If full
    lastword = 7
  Else
    lastword = 5
  EndIf
  For i = 0 To lastword
    *digest\w[i] = ChangeEndian64(*ctx\state[i])
  Next
EndProcedure
;================================================================
; EXPORTED FUNCTIONS
;================================================================
; Hashes a memory block and returns the digest as lowercase hexadecimal text.
;   *datapointer : address of the data
;   Length       : number of bytes to hash
;   full         : non-zero = SHA-512 (128 hex digits), zero = SHA-384 (96 hex digits)
;   *callback    : optional progress procedure, see shaQuad_update
; Returns an empty string if the working memory cannot be allocated.
;
; The hex text is built by string concatenation instead of PokeS at a 2-byte
; stride: in a Unicode build each PokeS writes 2-byte characters, which garbled
; the result and wrote past the end of the 129-byte output buffer.
Procedure.s shaQuadFingerprint(*datapointer, Length, full, *callback=0) ; Data address, data size, [ ,<procaddress> ]
  Protected *ctx.sha512_context, *sha512sum
  Protected digest.s
  Protected j, lastbyte

  *ctx = AllocateMemory(SizeOf(sha512_context))
  *sha512sum = AllocateMemory(64)

  If *ctx And *sha512sum
    If full
      sha512_starts( *ctx )
      lastbyte = 63
    Else
      sha384_starts( *ctx )
      lastbyte = 47
    EndIf

    shaQuad_update( *ctx, *datapointer, Length, Length, *callback )
    shaQuad_finish( *ctx, *sha512sum, Length, full, *callback )

    ; two hex digits per digest byte, left-padded with "0"
    For j = 0 To lastbyte
      digest + RSet( Hex( PeekA(*sha512sum+j) ), 2, "0" )
    Next
    digest = LCase(digest)
  EndIf

  If *sha512sum : FreeMemory(*sha512sum) : EndIf
  If *ctx : FreeMemory(*ctx) : EndIf

  ProcedureReturn digest
EndProcedure
; Public entry point: SHA-512 of a memory block as 128 lowercase hex digits.
; Thin wrapper that selects the full-length digest of shaQuadFingerprint.
ProcedureDLL.s sha512Fingerprint(*datapointer, Length, *callback=0) ; Data address, data size, [ ,<procaddress> ]
  Protected hexdigest.s
  hexdigest = shaQuadFingerprint(*datapointer, Length, #True, *callback)
  ProcedureReturn hexdigest
EndProcedure
; Public entry point: SHA-384 of a memory block as 96 lowercase hex digits.
; Thin wrapper that selects the truncated digest of shaQuadFingerprint.
ProcedureDLL.s sha384Fingerprint(*datapointer, Length, *callback=0) ; Data address, data size, [ ,<procaddress> ]
  Protected hexdigest.s
  hexdigest = shaQuadFingerprint(*datapointer, Length, #False, *callback)
  ProcedureReturn hexdigest
EndProcedure
; Hashes the contents of a file and returns the digest as lowercase hex text.
;   filename  : path of the file to hash
;   full      : non-zero = SHA-512 (128 hex digits), zero = SHA-384 (96 hex digits)
;   *callback : optional progress procedure, see shaQuad_update
; Returns an empty string if the file cannot be opened for reading, if a read
; fails part-way, or if the working memory cannot be allocated.
;
; Changes against the earlier version:
;  - the file is opened with ReadFile, not OpenFile, so a missing file is no
;    longer created and read-only files can be hashed;
;  - a failed read ends the loop instead of spinning on Eof forever;
;  - the hex text is built by concatenation, because PokeS at a 2-byte stride
;    garbles the result and overruns the buffer in a Unicode build.
Procedure.s shaQuadFileFingerprint(filename.s, full, *callback=0) ; filename$, [ ,<procaddress> ]
  Protected digest.s
  Protected *datapointer, *sha512sum, *ctx.sha512_context
  Protected bytesread, fresult, j, lastbyte
  Protected readok = #True
  Protected jobsize.q

  *ctx = AllocateMemory(SizeOf(sha512_context))
  *sha512sum = AllocateMemory(64)
  ; 4096 is a multiple of the 128-byte block size, so full reads never leave
  ; a partial block behind in the context buffer
  *datapointer = AllocateMemory(4096)

  If *ctx And *sha512sum And *datapointer
    fresult = ReadFile(#PB_Any, filename)
    If fresult
      jobsize = Lof(fresult)
      If full
        sha512_starts( *ctx )
        lastbyte = 63
      Else
        sha384_starts( *ctx )
        lastbyte = 47
      EndIf

      While Not Eof(fresult)
        bytesread = ReadData(fresult, *datapointer, 4096)
        If bytesread <= 0
          ; read error: give up rather than return a digest of partial data
          readok = #False
          Break
        EndIf
        shaQuad_update( *ctx, *datapointer, bytesread, jobsize, *callback )
      Wend

      If readok
        shaQuad_finish( *ctx, *sha512sum, jobsize, full, *callback )
        ; two hex digits per digest byte, left-padded with "0"
        For j = 0 To lastbyte
          digest + RSet( Hex( PeekA(*sha512sum+j) ), 2, "0" )
        Next
        digest = LCase(digest)
      EndIf

      CloseFile(fresult)
    EndIf
  EndIf

  If *datapointer : FreeMemory(*datapointer) : EndIf
  If *sha512sum : FreeMemory(*sha512sum) : EndIf
  If *ctx : FreeMemory(*ctx) : EndIf

  ProcedureReturn digest
EndProcedure
; Public entry point: SHA-512 of a file's contents as 128 lowercase hex digits.
; Thin wrapper that selects the full-length digest of shaQuadFileFingerprint.
ProcedureDLL.s sha512FileFingerprint(filename.s, *callback=0) ; filename$, [ ,<procaddress> ]
  Protected hexdigest.s
  hexdigest = shaQuadFileFingerprint(filename, #True, *callback)
  ProcedureReturn hexdigest
EndProcedure
; Public entry point: SHA-384 of a file's contents as 96 lowercase hex digits.
; Thin wrapper that selects the truncated digest of shaQuadFileFingerprint.
ProcedureDLL.s sha384FileFingerprint(filename.s, *callback=0) ; filename$, [ ,<procaddress> ]
  Protected hexdigest.s
  hexdigest = shaQuadFileFingerprint(filename, #False, *callback)
  ProcedureReturn hexdigest
EndProcedure
The speed difference between Sigma01 and Sigma23 written as procedures and written as macros is very small, so procedures might be the better choice if the size of the compiled code is important.