Page 3 of 5

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Wed Aug 17, 2011 6:22 pm
by wilbert
Yeah, all the way up to xmm15 :shock:
Well, on 32 bit there's also mm0 - mm7 but I don't know if they are equally fast.

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Wed Aug 17, 2011 6:26 pm
by netmaestro
mm0-mm7 alias the floating point registers, right? If you use them is it enough to execute emms when you're finished or do you have to do more to preserve the floating point state?

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Wed Aug 17, 2011 6:41 pm
by wilbert
Yes, they are alias for fpu registers and emms should be enough.
You also have movq2dq and movdq2q to move between mmx and sse registers.

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Wed Aug 17, 2011 9:26 pm
by Thorium
wilbert wrote:Yeah, all the way up to xmm15 :shock:
Well, on 32 bit there's also mm0 - mm7 but I don't know if they are equally fast.
Yeah pretty cool.
Over the last few days I wrote some ARMv7 ASM code and boy, ARM has a huge number of registers: 16 general purpose registers and 32 VFP (FPU) registers. :shock:

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Thu Aug 18, 2011 2:35 pm
by wilbert
Here's my final attempt.

EDIT:It should work on Windows also now (worked only on OS X first) thanks to the correction Netmaestro suggested.

I changed the sha512_context structure so the state is first. That was a little easier for the ASM routine.
It was a lot of work to rewrite the process routine. Hopefully the speed is fast enough now :)

Code: Select all

;====================================================================================
;
; Library Commands:         sha512FingerPrint()
;                           sha512FileFingerPrint()
;                           sha384FingerPrint()
;                           sha384FileFingerPrint()
;
; Author:                   Lloyd Gallant (netmaestro)
;
; Contributors:             Thanks to wilbert, Danilo, thorium, infratec and idle
;                           for their help with the asm routines
;                           and to Christopher Devine for the
;                           c code this program is based on.
; 
; Date:                     August 15, 2011
; Target Compiler:          Purebasic 4 and up
; Target OS:                Windows, Linux, MacOS
; 
; License:                  GNU General Public License
;
; This program is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License As published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY Or FITNESS For A PARTICULAR PURPOSE.  See the
; GNU General Public License For more details.
;
; The logic for this program is based on sha256.c found here:
;
; http://www.spale.com/download/scrypt/scrypt1.0/
;
; You can test the accuracy of this program by comparing results with 
; test data at:
;
; http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/SHA2_Additional.pdf
;
;===================================================================================
;
; 
; Usage: 
;
; result$ = sha512Fingerprint(*address, length, [ ,*progress ] )
; result$ = sha512FileFingerprint(file$, [ ,*progress ] )
;
; result$ = sha384Fingerprint(*address, length, [ ,*progress ] )
; result$ = sha384FileFingerprint(file$, [ ,*progress ] )
;
; Progress callback function:
;
; Procedure MyCallBack(value.i)
;   ; value is 0 to 100 representing percentage completed
; Endprocedure 
;
;
;================================================================
;                      STRUCTURES                      
;================================================================

; Hashing context shared by the SHA-384 and SHA-512 code paths.
; The assembler compression routine addresses the state through [edx + 0..56],
; so 'state' has to remain the first member.
Structure sha512_context
  state.q [8]
  ; eight 64-bit chaining values (set by sha384_starts / sha512_starts)
  total.q  
  ; running count of bytes handed to shaQuad_update
  buffer.a [128]
  ; holds an incomplete 128-byte block between shaQuad_update calls
EndStructure

; 128 raw bytes (one SHA-512 block); shaQuad_finish uses it for the padding
; bytes ($80 followed by zeros).
Structure UINT8_BUFFER
  b.a[128]
EndStructure

; View of memory as an array of 64-bit words. Used as the pointer type for the
; digest output (indices 0..7 are written by shaQuad_finish) and for the block
; pointer of sha512_process. 80 quads = 640 bytes, the size of the message
; schedule the assembler routine builds on its stack.
Structure UINT64_BUFFER
  w.q[80]
EndStructure

; 16-byte length trailer appended to the message by shaQuad_finish.
; NOTE: the member names describe memory order, not numeric significance:
; shaQuad_finish leaves 'lowpart' (first 8 bytes) at zero and stores the
; byte-swapped bit count in 'highpart' (last 8 bytes).
Structure msglen
  lowpart.q
  highpart.q
EndStructure

;================================================================
;                      HELPER MACROS                     
;================================================================

; SIGMA0M - message-schedule function: ROTR 1 XOR ROTR 8 XOR SHR 7.
; In:    XMM register argument with the 64-bit input in its low quadword
; Out:   low quadword of the same register = result; the upper quadword is
;        scratch and must be ignored by the caller
; Clobb: xmm6, xmm7 (the argument therefore must not be xmm6 or xmm7)
; A 64-bit rotate is emulated with pshufd / psrlq / pshufd: the value is
; duplicated with swapped dword halves, both copies are shifted, and the
; shuffle picks the two dwords that together form the rotated value.
Macro SIGMA0M(reg) ; By wilbert
  !movq xmm7, reg
  !pshufd reg, reg, 0x14; 0 - 1 - 1 - 0  
  ; rotate right 1
  !psrlq reg, 1
  !pshufd reg, reg, 0x28; 0 - 2 - 2 - 0
  !movdqa xmm6, reg
  ; delta rotate 8 - 1 = 7 right
  !psrlq xmm6, 7
  !pshufd xmm6, xmm6, 0x28; 0 - 2 - 2 - 0
  !pxor reg, xmm6
  ; shift right 7
  !psrlq xmm7, 7
  !pxor reg, xmm7
EndMacro

; SIGMA1M - message-schedule function: ROTR 19 XOR ROTR 61 XOR SHR 6.
; In:    XMM register argument with the 64-bit input in its low quadword
; Out:   low quadword of the same register = result; upper quadword is scratch
; Clobb: xmm6, xmm7 (the argument therefore must not be xmm6 or xmm7)
; The second rotate is done as a left rotate by 22 of the already rotated
; value (19 + 42 = 61), hence the different shuffle masks and psllq.
Macro SIGMA1M(reg) ; By wilbert
  !movq xmm7, reg
  !pshufd reg, reg, 0x14; 0 - 1 - 1 - 0  
  ; rotate right 19
  !psrlq reg, 19
  !pshufd reg, reg, 0x28; 0 - 2 - 2 - 0
  !movdqa xmm6, reg
  ; delta rotate 61 - 19 = 42 right = 22 left
  !pshufd xmm6, xmm6, 0x41; 1 - 0 - 0 - 1 
  !psllq xmm6, 22
  !pshufd xmm6, xmm6, 0x7d; 1 - 3 - 3 - 1
  !pxor reg, xmm6
  ; shift right 6
  !psrlq xmm7, 6
  !pxor reg, xmm7
EndMacro

; SIGMA2M - round function applied to working variable 'a':
; ROTR 28 XOR ROTR 34 XOR ROTR 39.
; In:    XMM register argument with the 64-bit input in its low quadword
; Out:   low quadword of the same register = result; upper quadword is scratch
; Clobb: xmm7 only (the P macro relies on xmm5 / xmm6 surviving this macro)
; Each later rotate is produced incrementally from the previous one, so the
; statement order must not be changed.
Macro SIGMA2M(reg) ; By wilbert
  !pshufd reg, reg, 0x14; 0 - 1 - 1 - 0  
  ; rotate right 28
  !psrlq reg, 28
  !pshufd reg, reg, 0x28; 0 - 2 - 2 - 0
  !movdqa xmm7, reg
  ; delta rotate 34 - 28 = 6 right
  !psrlq xmm7, 6
  !pshufd xmm7, xmm7, 0x28; 0 - 2 - 2 - 0
  !pxor reg, xmm7
  ; delta rotate 39 - 34 = 5 right
  !psrlq xmm7, 5
  !pshufd xmm7, xmm7, 0x28; 0 - 2 - 2 - 0
  !pxor reg, xmm7
EndMacro

; SIGMA3M - round function applied to working variable 'e':
; ROTR 14 XOR ROTR 18 XOR ROTR 41.
; In:    XMM register argument with the 64-bit input in its low quadword
; Out:   low quadword of the same register = result; upper quadword is scratch
; Clobb: xmm7 only (the P macro relies on xmm5 / xmm6 surviving this macro)
; Each later rotate is produced incrementally from the previous one, so the
; statement order must not be changed.
Macro SIGMA3M(reg) ; By wilbert
  !pshufd reg, reg, 0x14; 0 - 1 - 1 - 0  
  ; rotate right 14
  !psrlq reg, 14
  !pshufd reg, reg, 0x28; 0 - 2 - 2 - 0
  !movdqa xmm7, reg
  ; delta rotate 18 - 14 = 4 right
  !psrlq xmm7, 4
  !pshufd xmm7, xmm7, 0x28; 0 - 2 - 2 - 0
  !pxor reg, xmm7
  ; delta rotate 41 - 18 = 23 right
  !psrlq xmm7, 23
  !pshufd xmm7, xmm7, 0x28; 0 - 2 - 2 - 0
  !pxor reg, xmm7
EndMacro

; P - one SHA-512 round.
; The first eight arguments are the MMX registers that currently play the roles
; of the eight working variables; 'offset' is the byte offset of this round
; within the current group of eight rounds.
; Expects: esi = address of sha512constants, edi = address of the expanded
;          message schedule, ecx = byte offset of the current group.
; Effect:  the register in the 4th position receives its old value + temp2,
;          the register in the 8th position receives temp1 + temp2; the other
;          six registers are only read. The caller rotates the argument list
;          instead of moving values between registers.
; Clobb:   xmm0 - xmm7
; ('!' inside the F1 comments below is the PureBasic XOR operator.)
Macro P(a,b,c,d,e,f,g,h, offset) ; By wilbert
  !movq2dq xmm0, a
  !movq2dq xmm1, e
  !movq xmm6, xmm0
  !movq xmm5, xmm1
  SIGMA2M(xmm0); xmm0 = temp1 = Sigma2(a)
  SIGMA3M(xmm1); xmm1 = temp2 = Sigma3(e)
  !movq2dq xmm2, h
  !paddq xmm1, xmm2; temp2 + h
  !movq xmm2, [esi + ecx + offset]
  !paddq xmm1, xmm2; temp2 + *sha512constants [ ]
  !movq xmm2, [edi + ecx + offset]
  !paddq xmm1, xmm2; temp2 + *w [ ]
  ; F0 = ((a & b) | (c & (a | b)))
  !movq2dq xmm3, b
  !movq xmm4, xmm6
  !por xmm4, xmm3; xmm4 = a | b
  !pand xmm6, xmm3; xmm6 = a & b
  !movq2dq xmm3, c
  !pand xmm3, xmm4; xmm3 = c & (a | b)
  !por xmm6, xmm3; xmm6 = F0
  !paddq xmm0, xmm6; temp1 + F0
  ; F1 = (g ! (e & (f ! g)))
  !movq2dq xmm2, f
  !movq2dq xmm3, g
  !pxor xmm2, xmm3; xmm2 = f ! g
  !pand xmm5, xmm2; xmm5 = e & (f ! g)
  !pxor xmm3, xmm5; xmm3 = F1
  !paddq xmm1, xmm3; temp2 + F1
  !movq2dq xmm2, d
  !paddq xmm2, xmm1
  !movdq2q d, xmm2; d + temp2
  !paddq xmm0, xmm1
  !movdq2q h, xmm0; h = temp 1 + temp2
EndMacro

; ADD_RESULT - adds the 64-bit value stored at [edx + offset] to the given MMX
; register and writes the sum back to the same memory location.
; Expects edx = address of the context (state array at offset 0).
; The MMX register is left holding the sum.
Macro ADD_RESULT(reg, offset) ; By wilbert
  !paddq reg, [edx + offset]
  !movq [edx + offset], reg
EndMacro

; DEF_Q - emits four 64-bit constants. Each constant is given as a
; (high dword, low dword) pair and is written low dword first, i.e. in
; little-endian memory order.
Macro DEF_Q(h1, l1, h2, l2, h3, l3, h4, l4) ; By wilbert
  !dd l1,h1,l2,h2,l3,h3,l4,h4
EndMacro

; Table of the 80 64-bit round constants (20 rows of 4), read by the P macro
; through esi. Each DEF_Q argument pair is (high dword, low dword).
DataSection
!sha512constants:
DEF_Q(0x428a2f98,0xd728ae22 , 0x71374491,0x23ef65cd , 0xb5c0fbcf,0xec4d3b2f , 0xe9b5dba5,0x8189dbbc)
DEF_Q(0x3956c25b,0xf348b538 , 0x59f111f1,0xb605d019 , 0x923f82a4,0xaf194f9b , 0xab1c5ed5,0xda6d8118)
DEF_Q(0xd807aa98,0xa3030242 , 0x12835b01,0x45706fbe , 0x243185be,0x4ee4b28c , 0x550c7dc3,0xd5ffb4e2)
DEF_Q(0x72be5d74,0xf27b896f , 0x80deb1fe,0x3b1696b1 , 0x9bdc06a7,0x25c71235 , 0xc19bf174,0xcf692694)
DEF_Q(0xe49b69c1,0x9ef14ad2 , 0xefbe4786,0x384f25e3 , 0x0fc19dc6,0x8b8cd5b5 , 0x240ca1cc,0x77ac9c65)
DEF_Q(0x2de92c6f,0x592b0275 , 0x4a7484aa,0x6ea6e483 , 0x5cb0a9dc,0xbd41fbd4 , 0x76f988da,0x831153b5)
DEF_Q(0x983e5152,0xee66dfab , 0xa831c66d,0x2db43210 , 0xb00327c8,0x98fb213f , 0xbf597fc7,0xbeef0ee4)
DEF_Q(0xc6e00bf3,0x3da88fc2 , 0xd5a79147,0x930aa725 , 0x06ca6351,0xe003826f , 0x14292967,0x0a0e6e70)
DEF_Q(0x27b70a85,0x46d22ffc , 0x2e1b2138,0x5c26c926 , 0x4d2c6dfc,0x5ac42aed , 0x53380d13,0x9d95b3df)
DEF_Q(0x650a7354,0x8baf63de , 0x766a0abb,0x3c77b2a8 , 0x81c2c92e,0x47edaee6 , 0x92722c85,0x1482353b)
DEF_Q(0xa2bfe8a1,0x4cf10364 , 0xa81a664b,0xbc423001 , 0xc24b8b70,0xd0f89791 , 0xc76c51a3,0x0654be30)
DEF_Q(0xd192e819,0xd6ef5218 , 0xd6990624,0x5565a910 , 0xf40e3585,0x5771202a , 0x106aa070,0x32bbd1b8)
DEF_Q(0x19a4c116,0xb8d2d0c8 , 0x1e376c08,0x5141ab53 , 0x2748774c,0xdf8eeb99 , 0x34b0bcb5,0xe19b48a8)
DEF_Q(0x391c0cb3,0xc5c95a63 , 0x4ed8aa4a,0xe3418acb , 0x5b9cca4f,0x7763e373 , 0x682e6ff3,0xd6b2b8a3)
DEF_Q(0x748f82ee,0x5defb2fc , 0x78a5636f,0x43172f60 , 0x84c87814,0xa1f0ab72 , 0x8cc70208,0x1a6439ec)
DEF_Q(0x90befffa,0x23631e28 , 0xa4506ceb,0xde82bde9 , 0xbef9a3f7,0xb2c67915 , 0xc67178f2,0xe372532b)
DEF_Q(0xca273ece,0xea26619c , 0xd186b8c7,0x21c0c207 , 0xeada7dd6,0xcde0eb1e , 0xf57d4f7f,0xee6ed178)
DEF_Q(0x06f067aa,0x72176fba , 0x0a637dc5,0xa2c898a6 , 0x113f9804,0xbef90dae , 0x1b710b35,0x131c471b)
DEF_Q(0x28db77f5,0x23047d84 , 0x32caab7b,0x40c72493 , 0x3c9ebe0a,0x15c9bebc , 0x431d67c4,0x9c100d4c)
DEF_Q(0x4cc5d4be,0xcb3e42b6 , 0x597f299c,0xfc657e2a , 0x5fcb6fab,0x3ad6faec , 0x6c44198c,0x4a475817)
EndDataSection

; Returns the address of a small byte-swap routine that is embedded after the
; ProcedureReturn; the embedded code is never reached by falling through.
; The routine is called through the ChangeEndian64 prototype below (PrototypeC,
; caller cleans the stack):
;   In:  [esp + 4] = low dword, [esp + 8] = high dword of the 64-bit argument
;   Out: edx:eax   = argument with its eight bytes reversed
;   Clobb: eax, edx only
Procedure ChgEnd64Addr__() ; By idle
  ; small adaptation by Wilbert to
  ; work around a Purebasic OS X bug
  !mov eax,ChgEnd64_start
  ProcedureReturn
  !ChgEnd64_start:  
  !mov  eax, [esp + 8]
  !mov  edx, [esp + 4]
  ; swapping the bytes inside each half and exchanging the halves
  ; reverses all eight bytes
  !bswap eax
  !bswap edx 
  !ret
EndProcedure

; Function pointer bound once at program start to the embedded routine.
PrototypeC.q ProtoChgEnd64(value.q)
Global ChangeEndian64.ProtoChgEnd64 = ChgEnd64Addr__()

;================================================================
;                  LOCAL FUNCTIONS
;================================================================

Procedure sha384_starts(*ctx.sha512_context )
  ; Prepares *ctx for a new SHA-384 computation: loads the eight SHA-384
  ; initial hash values and resets the byte counter.
  ; *ctx must point to a writable sha512_context.
  
  *ctx\state[0] = $cbbb9d5dc1059ed8
  *ctx\state[1] = $629a292a367cd507
  *ctx\state[2] = $9159015a3070dd17
  *ctx\state[3] = $152fecd8f70e5939
  *ctx\state[4] = $67332667ffc00b31
  *ctx\state[5] = $8eb44a8768581511
  *ctx\state[6] = $db0c2e0d64f98fa7
  *ctx\state[7] = $47b5481dbefa4fa4
  
  ; shaQuad_update / shaQuad_finish derive the buffer fill level and the
  ; padding from 'total', so it must start at zero even when the context is
  ; reused or was not allocated from zero-filled memory.
  *ctx\total = 0
  
EndProcedure

Procedure sha512_starts( *ctx.sha512_context )
  ; Prepares *ctx for a new SHA-512 computation: loads the eight SHA-512
  ; initial hash values and resets the byte counter.
  ; *ctx must point to a writable sha512_context.
  
  *ctx\state[0] = $6a09e667f3bcc908
  *ctx\state[1] = $bb67ae8584caa73b
  *ctx\state[2] = $3c6ef372fe94f82b
  *ctx\state[3] = $a54ff53a5f1d36f1
  *ctx\state[4] = $510e527fade682d1
  *ctx\state[5] = $9b05688c2b3e6c1f
  *ctx\state[6] = $1f83d9abfb41bd6b
  *ctx\state[7] = $5be0cd19137e2179
  
  ; shaQuad_update / shaQuad_finish derive the buffer fill level and the
  ; padding from 'total', so it must start at zero even when the context is
  ; reused or was not allocated from zero-filled memory.
  *ctx\total = 0
  
EndProcedure

; Returns the address of the SHA-512 compression routine embedded after the
; ProcedureReturn. The routine is called through the sha512_process prototype
; below (PrototypeC, 32-bit x86, caller cleans the stack; needs MMX and SSE2):
;   In:    [esp + 4] = *ctx   (sha512_context, state array at offset 0)
;          [esp + 8] = *bytes (one 128-byte message block)
;   Out:   the eight state words in *ctx are updated in place
;   Saved: esi, edi
;   Clobb: eax, ecx, edx, flags, mm0 - mm7, xmm0 - xmm7 (emms is executed
;          before returning so the FPU is usable again)
;   Stack: 656 bytes = 640-byte message schedule + 16 bytes alignment slack
Procedure sha512_process_addr__() ; By wilbert
  !mov eax, sha512_process_start
  ProcedureReturn
  !sha512_process_start:
  !push esi
  !push edi
  !mov edx, [esp + 12]; edx = *ctx.sha512_context
  !mov esi, [esp + 16]; esi = *bytes.UINT64_BUFFER
  !sub esp, 656
  !mov edi, esp
  ; Round UP to the next 16-byte boundary. Masking alone rounds down and
  ; would place the first bytes of the schedule below esp, outside the
  ; reserved area. 640 + 15 <= 656, so the buffer always fits.
  !add edi, 0xf
  !and edi, 0xfffffff0; edi = *w.UINT64_BUFFER
  
  ; copy from *bytes to *w and change endian
  ; (ecx runs 120, 112, ... 0; the borrow of the last sub ends the loop)
  !mov ecx, 120
  !changeEndianLoop:
  !mov eax, [esi + ecx]
  !bswap eax
  !mov [edi + ecx + 4], eax
  !mov eax, [esi + ecx + 4]
  !bswap eax
  !mov [edi + ecx], eax
  !sub ecx, 8
  !jnc changeEndianLoop
  
  ; sigma 0 & sigma1 loop
  ; expands w[16..79]; ecx = byte offset of w[t], from 128 up to 640
  !mov ecx, 128
  !sigma01Loop:
  !movq xmm2, [edi + ecx - 128]; w\w[t-16]
  !movq xmm0, [edi + ecx - 120]; w\w[t-15]
  !movq xmm3, [edi + ecx - 56]; w\w[t-7]
  !movq xmm1, [edi + ecx - 16]; w\w[t-2]
  SIGMA0M(xmm0)
  SIGMA1M(xmm1)
  !paddq xmm0, xmm1
  !paddq xmm0, xmm2
  !paddq xmm0, xmm3
  !movq [edi + ecx], xmm0
  !add ecx, 8
  !cmp ecx, 640
  !jne sigma01Loop
  
  ; load the eight state words into the MMX registers (working variables)
  !movq mm0, [edx]
  !movq mm1, [edx + 8]
  !movq mm2, [edx + 16]
  !movq mm3, [edx + 24]
  !movq mm4, [edx + 32]
  !movq mm5, [edx + 40]
  !movq mm6, [edx + 48]
  !movq mm7, [edx + 56]
  
  !mov esi, sha512constants; esi = *sha512constants
  !xor ecx, ecx
  ; loop 10 times
  ; (8 rounds per pass; the register roles rotate by one position per round
  ;  and are back in their starting order after eight rounds)
  !sigma23Loop:
  P( mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0)
  P( mm7, mm0, mm1, mm2, mm3, mm4, mm5, mm6, 8)
  P( mm6, mm7, mm0, mm1, mm2, mm3, mm4, mm5, 16)
  P( mm5, mm6, mm7, mm0, mm1, mm2, mm3, mm4, 24)
  P( mm4, mm5, mm6, mm7, mm0, mm1, mm2, mm3, 32)
  P( mm3, mm4, mm5, mm6, mm7, mm0, mm1, mm2, 40)
  P( mm2, mm3, mm4, mm5, mm6, mm7, mm0, mm1, 48)
  P( mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm0, 56)
  !add ecx, 64
  !cmp ecx, 640
  !jne sigma23Loop
  
  ; state[i] = state[i] + working variable i
  ADD_RESULT(mm0, 0)
  ADD_RESULT(mm1, 8)
  ADD_RESULT(mm2, 16)
  ADD_RESULT(mm3, 24)
  ADD_RESULT(mm4, 32)
  ADD_RESULT(mm5, 40)
  ADD_RESULT(mm6, 48)
  ADD_RESULT(mm7, 56)
    
  !emms
  !add esp, 656
  !pop edi
  !pop esi
  !ret
EndProcedure

; Function pointer bound once at program start to the embedded routine.
PrototypeC sha512_process_proto(*ctx.sha512_context, *bytes.UINT64_BUFFER)
Global sha512_process.sha512_process_proto = sha512_process_addr__()

Procedure shaQuad_update( *ctx.sha512_context, *input, length, jobsize.q, *callback )
  ; Feeds 'length' bytes starting at *input into the hash described by *ctx.
  ; Complete 128-byte blocks are compressed immediately; a trailing partial
  ; block is kept in *ctx\buffer until more data arrives.
  ;
  ; jobsize   : total number of bytes of the whole job, used only to compute
  ;             the progress percentage
  ; *callback : optional procedure address; it is called with one integer
  ;             (0..100) after every block taken directly from *input
  ;
  ; Progress is derived from *ctx\total rather than from a Static counter, so
  ; it starts at zero again for every new context instead of accumulating
  ; over all hashes computed by the program.
  
  Protected.l left, fill
  Protected processed.q
  Protected progress
  
  ; nothing to do; also protects CopyMemory from a negative size
  If length <= 0 : ProcedureReturn : EndIf
  
  left = *ctx\total & 127   ; bytes already waiting in the buffer
  fill = 128 - left         ; bytes needed to complete the buffered block
  
  *ctx\total + length
  
  ; complete and process a previously buffered partial block first
  If left And (length >= fill)
    CopyMemory( *input, @*ctx\buffer[0] + left, fill )
    sha512_process( *ctx, @*ctx\buffer[0] )
    length - fill
    *input + fill
    left = 0
  EndIf
  
  ; process all complete blocks straight from the input
  While length >= 128
    sha512_process( *ctx, *input )
    length - 128
    *input + 128
    
    If *callback And jobsize > 0
      ; 'total' already includes this call's input; 'length' is what is left
      processed = *ctx\total - length
      progress  = processed * 100 / jobsize
      CallFunctionFast(*callback, progress)
    EndIf
    
  Wend
  
  ; keep the remainder for the next call
  If length
    CopyMemory( *input, @*ctx\buffer[0] + left, length )
  EndIf
  
EndProcedure

Procedure shaQuad_finish( *ctx.sha512_context, *digest.UINT64_BUFFER, jobsize.q, full, *callback )
  ; Appends the padding and the 128-bit length trailer, then writes the digest
  ; to *digest in big-endian byte order.
  ;
  ; *digest : receives 48 bytes (full = 0) or 64 bytes (full <> 0)
  ; jobsize : kept for interface compatibility. The length trailer is taken
  ;           from *ctx\total, the number of bytes that were really hashed, so
  ;           padding and trailer always agree even if the caller's jobsize
  ;           differs from the data that was fed in.
  
  Protected.l last, padn
  Protected i
  Protected bitlen.msglen
  Protected sha512_padding.UINT8_BUFFER
  
  sha512_padding\b[0] = $80
  
  ; Message length in bits, big-endian, in the last 8 of the 16 trailer bytes.
  ; Must be captured before the padding calls below increase *ctx\total.
  bitlen\lowpart  = 0
  bitlen\highpart = ChangeEndian64(*ctx\total << 3)
  
  ; pad so that exactly 16 bytes remain free in the final block
  last = *ctx\total & 127
  If last < 112
    padn = 112 - last
  Else
    padn = 240 - last
  EndIf
  
  shaQuad_update( *ctx, @sha512_padding, padn, padn, *callback )
  shaQuad_update( *ctx, @bitlen, 16, 16, *callback )
  
  For i = 0 To 5
    *digest\w[i] = ChangeEndian64(*ctx\state[i])
  Next
  
  ; the last two state words belong to the SHA-512 digest only
  If full
    For i = 6 To 7
      *digest\w[i] = ChangeEndian64(*ctx\state[i])
    Next
  EndIf
  
EndProcedure

;================================================================
;                  EXPORTED FUNCTIONS
;================================================================

Procedure.s shaQuadFingerprint(*datapointer, Length, full, *callback=0) ; Data address, data size, [ ,<procaddress> ]
  ; Hashes 'Length' bytes at *datapointer and returns the digest as a
  ; lowercase hexadecimal string: 128 characters for SHA-512 (full <> 0),
  ; 96 characters for SHA-384 (full = 0).
  ; Returns an empty string if the working memory cannot be allocated.
  ;
  ; The hex string is built by concatenation instead of PokeS into a byte
  ; buffer: PokeS writes two bytes per character in Unicode builds, which
  ; garbled the result and wrote past the end of the 129-byte buffer.
  
  Protected *ctx.sha512_context, *sha512sum
  Protected digest.s
  Protected j, lastByte
  
  *ctx       = AllocateMemory(SizeOf(sha512_context))
  *sha512sum = AllocateMemory(64)
  
  If *ctx And *sha512sum
    
    If full
      sha512_starts( *ctx )
      lastByte = 63
    Else
      sha384_starts( *ctx )
      lastByte = 47
    EndIf
    
    shaQuad_update( *ctx, *datapointer, Length, Length, *callback )
    shaQuad_finish( *ctx, *sha512sum, Length, full, *callback )
    
    For j = 0 To lastByte
      digest + RSet( Hex( PeekA(*sha512sum + j) ), 2, "0" )
    Next
    digest = LCase(digest)
    
  EndIf
  
  If *sha512sum : FreeMemory(*sha512sum) : EndIf
  If *ctx       : FreeMemory(*ctx)       : EndIf
  
  ProcedureReturn digest
  
EndProcedure

ProcedureDLL.s sha512Fingerprint(*datapointer, Length, *callback=0) ; Data address, data size, [ ,<procaddress> ]
  ; SHA-512 of a memory area; the third argument of the shared worker
  ; selects the full 512-bit digest.
  Protected fingerprint.s
  fingerprint = shaQuadFingerprint(*datapointer, Length, 1, *callback)
  ProcedureReturn fingerprint
EndProcedure

ProcedureDLL.s sha384Fingerprint(*datapointer, Length, *callback=0) ; Data address, data size, [ ,<procaddress> ]
  ; SHA-384 of a memory area; the third argument of the shared worker
  ; selects the shortened 384-bit digest.
  Protected fingerprint.s
  fingerprint = shaQuadFingerprint(*datapointer, Length, 0, *callback)
  ProcedureReturn fingerprint
EndProcedure

Procedure.s shaQuadFileFingerprint(filename.s, full, *callback=0) ; filename$, [ ,<procaddress> ]
  ; Hashes the contents of a file and returns the digest as a lowercase
  ; hexadecimal string: 128 characters for SHA-512 (full <> 0), 96 characters
  ; for SHA-384 (full = 0).
  ; Returns an empty string if the file cannot be opened for reading, if the
  ; working memory cannot be allocated, or if reading fails part-way.
  ;
  ; The file is opened with ReadFile: OpenFile asks for read/write access, so
  ; it fails on read-only files and creates the file when it does not exist.
  ; The hex string is built by concatenation because PokeS writes two bytes
  ; per character in Unicode builds.
  
  Protected digest.s
  Protected *datapointer, *sha512sum, *ctx.sha512_context
  Protected bytesread, fresult, readFailed
  Protected j, lastByte
  Protected jobsize.q
  
  *ctx         = AllocateMemory(SizeOf(sha512_context))
  *sha512sum   = AllocateMemory(64)
  *datapointer = AllocateMemory(4096)
  
  If *ctx And *sha512sum And *datapointer
    fresult = ReadFile(#PB_Any, filename)
  EndIf
  
  If fresult
    jobsize = Lof(fresult)
    
    If full
      sha512_starts( *ctx )
      lastByte = 63
    Else
      sha384_starts( *ctx )
      lastByte = 47
    EndIf
    
    While Not Eof(fresult)
      bytesread = ReadData(fresult, *datapointer, 4096)
      If bytesread <= 0
        ; no progress is possible; do not spin forever and do not return
        ; a digest of partial data
        readFailed = #True
        Break
      EndIf
      shaQuad_update( *ctx, *datapointer, bytesread, jobsize, *callback )
    Wend
    
    If Not readFailed
      shaQuad_finish( *ctx, *sha512sum, jobsize, full, *callback )
      
      For j = 0 To lastByte
        digest + RSet( Hex( PeekA(*sha512sum + j) ), 2, "0" )
      Next
      digest = LCase(digest)
    EndIf
    
    CloseFile(fresult)
  EndIf
  
  If *datapointer : FreeMemory(*datapointer) : EndIf
  If *sha512sum   : FreeMemory(*sha512sum)   : EndIf
  If *ctx         : FreeMemory(*ctx)         : EndIf
  
  ProcedureReturn digest
  
EndProcedure

ProcedureDLL.s sha512FileFingerprint(filename.s, *callback=0) ; filename$, [ ,<procaddress> ]
  ; SHA-512 of a file; the second argument of the shared worker selects
  ; the full 512-bit digest.
  Protected fingerprint.s
  fingerprint = shaQuadFileFingerprint(filename, 1, *callback)
  ProcedureReturn fingerprint
EndProcedure

ProcedureDLL.s sha384FileFingerprint(filename.s, *callback=0) ; filename$, [ ,<procaddress> ]
  ; SHA-384 of a file; the second argument of the shared worker selects
  ; the shortened 384-bit digest.
  Protected fingerprint.s
  fingerprint = shaQuadFileFingerprint(filename, 0, *callback)
  ProcedureReturn fingerprint
EndProcedure

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Thu Aug 18, 2011 3:40 pm
by netmaestro
Wow :shock: That is really a fast program! It's almost as fast as the 32bit version. I got it to run on Windows by removing these lines:

Code: Select all

!section .data
!align 16
[../]
!section .text
and replacing them with the Purebasic DataSection-EndDataSection block. Then it runs fine. Results are accurate and the speed is phenomenal. This one is more your program than mine now :lol:

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Thu Aug 18, 2011 4:10 pm
by wilbert
The changes you proposed with the DataSection - EndDataSection work fine on OS X also.
Feel free to make the changes :)

Yes, it is pretty fast. Using registers helps a lot.
It was quite a challenge to fit everything in the available registers.
Is it still slower compared to Hashslash ?

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Thu Aug 18, 2011 4:15 pm
by netmaestro
Is it still slower compared to Hashslash ?
No, not at all. For SHA512 hashes your code beats hashslash by 50% - what takes your code 4 seconds takes them 6. And your quad code is equalling the speed of my dword code, which means I've got some work to do.

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Thu Aug 18, 2011 4:21 pm
by wilbert
netmaestro wrote:which means I've got some work to do.
:)

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Thu Aug 18, 2011 4:52 pm
by wilbert
Thorium wrote:The last days i did wrote some ARMv7 ASM code and boy, ARM has a huge amount of registers: 16 general purpose registers and 32 VFP (FPU) registers. :shock:
That's a lot.
My first experience with ASM was a Zilog Z80 cpu running at 3.57 Mhz. :shock:

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Thu Aug 18, 2011 4:56 pm
by netmaestro
Mine was this program :mrgreen:

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Thu Aug 18, 2011 6:07 pm
by Thorium

Code: Select all

  !movq xmm0, [edi + ecx - 120]; w\w[t-15]
  !movq xmm1, [edi + ecx - 16]; w\w[t-2]
  !movq xmm2, [edi + ecx - 128]; w\w[t-16]
  !movq xmm3, [edi + ecx - 56]; w\w[t-7]
That actually could mess up the cache.

Try if this is a bit faster:

Code: Select all

  !movq xmm2, [edi + ecx - 128]; w\w[t-16]
  !movq xmm0, [edi + ecx - 120]; w\w[t-15]
  !movq xmm3, [edi + ecx - 56]; w\w[t-7]
  !movq xmm1, [edi + ecx - 16]; w\w[t-2]
Maybe it is faster, maybe not. But you should read memory in one direction to benefit from the prefetching.

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Thu Aug 18, 2011 6:29 pm
by wilbert
I don't see a speed difference but I also see no reason why not to change it if it is better.
I simply didn't consider prefetching.

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Thu Aug 18, 2011 6:42 pm
by netmaestro
Posted code is updated, thanks for that.

Re: SHA512/384 Fingerprint and FileFingerprint

Posted: Sun Aug 21, 2011 7:54 am
by wilbert
I made a few small changes.
The callback procedure if specified, was called every 128 bytes which is very often if you process a large file.
I made a few changes so it is called less often and also added a call to the callback with 100% at the end, since users of a callback might expect that. Previously it didn't always reach 100.
The other change I made is that I replaced the LCase / RSet / Hex combination with one custom function that takes three parameters:
HexBytes(*addr, numBytes, *buffer)
First is the address to read from, second the number of bytes to process, third the buffer to place the result in.

Code: Select all

;====================================================================================
;
; Library Commands:         sha512FingerPrint()
;                           sha512FileFingerPrint()
;                           sha384FingerPrint()
;                           sha384FileFingerPrint()
;
; Authors:                  Lloyd Gallant (netmaestro) and Wilbert
;
; Contributors:             Thanks to Danilo, thorium, infratec and idle
;                           for their help with the asm routines
;                           and to Christopher Devine for the
;                           c code this program is based on.
; 
; Date:                     August 15, 2011
; Target Compiler:          Purebasic 4 and up
; Target OS:                Windows, Linux, MacOS
; 
; License:                  GNU General Public License
;
; This program is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License As published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY Or FITNESS For A PARTICULAR PURPOSE.  See the
; GNU General Public License For more details.
;
; The logic for this program is based on sha256.c found here:
;
; http://www.spale.com/download/scrypt/scrypt1.0/
;
; You can test the accuracy of this program by comparing results with 
; test data at:
;
; http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/SHA2_Additional.pdf
;
;===================================================================================
;
; 
; Usage: 
;
; result$ = sha512Fingerprint(*address, length, [ ,*progress ] )
; result$ = sha512FileFingerprint(file$, [ ,*progress ] )
;
; result$ = sha384Fingerprint(*address, length, [ ,*progress ] )
; result$ = sha384FileFingerprint(file$, [ ,*progress ] )
;
; Progress callback function:
;
; Procedure MyCallBack(value.i)
;   ; value is 0 to 100 representing percentage completed
; Endprocedure 
;
;
;================================================================
;                      STRUCTURES                      
;================================================================

; Hashing context shared by the SHA-384 and SHA-512 code paths.
; The assembler compression routine addresses the state through [edx + 0..56],
; so 'state' has to remain the first member.
Structure sha512_context
  state.q [8]
  ; eight 64-bit chaining values (set by sha384_starts / sha512_starts)
  total.q  
  ; running count of bytes handed to shaQuad_update
  buffer.a [128]
  ; holds an incomplete 128-byte block between shaQuad_update calls
EndStructure

; 128 raw bytes (one SHA-512 block); shaQuad_finish uses it for the padding
; bytes ($80 followed by zeros).
Structure UINT8_BUFFER
  b.a[128]
EndStructure

; View of memory as an array of 64-bit words. Used as the pointer type for the
; digest output (indices 0..7 are written by shaQuad_finish) and for the block
; pointer of sha512_process. 80 quads = 640 bytes, the size of the message
; schedule the assembler routine builds on its stack.
Structure UINT64_BUFFER
  w.q[80]
EndStructure

; 16-byte length trailer appended to the message by shaQuad_finish.
; NOTE: the member names describe memory order, not numeric significance:
; shaQuad_finish leaves 'lowpart' (first 8 bytes) at zero and stores the
; byte-swapped bit count in 'highpart' (last 8 bytes).
Structure msglen
  lowpart.q
  highpart.q
EndStructure

;================================================================
;                      HELPER MACROS                     
;================================================================

; SIGMA0M - message-schedule function: ROTR 1 XOR ROTR 8 XOR SHR 7.
; In:    XMM register argument with the 64-bit input in its low quadword
; Out:   low quadword of the same register = result; the upper quadword is
;        scratch and must be ignored by the caller
; Clobb: xmm6, xmm7 (the argument therefore must not be xmm6 or xmm7)
; A 64-bit rotate is emulated with pshufd / psrlq / pshufd: the value is
; duplicated with swapped dword halves, both copies are shifted, and the
; shuffle picks the two dwords that together form the rotated value.
Macro SIGMA0M(reg) ; By wilbert
  !movq xmm7, reg
  !pshufd reg, reg, 0x14; 0 - 1 - 1 - 0  
  ; rotate right 1
  !psrlq reg, 1
  !pshufd reg, reg, 0x28; 0 - 2 - 2 - 0
  !movdqa xmm6, reg
  ; delta rotate 8 - 1 = 7 right
  !psrlq xmm6, 7
  !pshufd xmm6, xmm6, 0x28; 0 - 2 - 2 - 0
  !pxor reg, xmm6
  ; shift right 7
  !psrlq xmm7, 7
  !pxor reg, xmm7
EndMacro

; SIGMA1M - message-schedule function: ROTR 19 XOR ROTR 61 XOR SHR 6.
; In:    XMM register argument with the 64-bit input in its low quadword
; Out:   low quadword of the same register = result; upper quadword is scratch
; Clobb: xmm6, xmm7 (the argument therefore must not be xmm6 or xmm7)
; The second rotate is done as a left rotate by 22 of the already rotated
; value (19 + 42 = 61), hence the different shuffle masks and psllq.
Macro SIGMA1M(reg) ; By wilbert
  !movq xmm7, reg
  !pshufd reg, reg, 0x14; 0 - 1 - 1 - 0  
  ; rotate right 19
  !psrlq reg, 19
  !pshufd reg, reg, 0x28; 0 - 2 - 2 - 0
  !movdqa xmm6, reg
  ; delta rotate 61 - 19 = 42 right = 22 left
  !pshufd xmm6, xmm6, 0x41; 1 - 0 - 0 - 1 
  !psllq xmm6, 22
  !pshufd xmm6, xmm6, 0x7d; 1 - 3 - 3 - 1
  !pxor reg, xmm6
  ; shift right 6
  !psrlq xmm7, 6
  !pxor reg, xmm7
EndMacro

; SIGMA2M - round function applied to working variable 'a':
; ROTR 28 XOR ROTR 34 XOR ROTR 39.
; In:    XMM register argument with the 64-bit input in its low quadword
; Out:   low quadword of the same register = result; upper quadword is scratch
; Clobb: xmm7 only (the P macro relies on xmm5 / xmm6 surviving this macro)
; Each later rotate is produced incrementally from the previous one, so the
; statement order must not be changed.
Macro SIGMA2M(reg) ; By wilbert
  !pshufd reg, reg, 0x14; 0 - 1 - 1 - 0  
  ; rotate right 28
  !psrlq reg, 28
  !pshufd reg, reg, 0x28; 0 - 2 - 2 - 0
  !movdqa xmm7, reg
  ; delta rotate 34 - 28 = 6 right
  !psrlq xmm7, 6
  !pshufd xmm7, xmm7, 0x28; 0 - 2 - 2 - 0
  !pxor reg, xmm7
  ; delta rotate 39 - 34 = 5 right
  !psrlq xmm7, 5
  !pshufd xmm7, xmm7, 0x28; 0 - 2 - 2 - 0
  !pxor reg, xmm7
EndMacro

; SIGMA3M - round function applied to working variable 'e':
; ROTR 14 XOR ROTR 18 XOR ROTR 41.
; In:    XMM register argument with the 64-bit input in its low quadword
; Out:   low quadword of the same register = result; upper quadword is scratch
; Clobb: xmm7 only (the P macro relies on xmm5 / xmm6 surviving this macro)
; Each later rotate is produced incrementally from the previous one, so the
; statement order must not be changed.
Macro SIGMA3M(reg) ; By wilbert
  !pshufd reg, reg, 0x14; 0 - 1 - 1 - 0  
  ; rotate right 14
  !psrlq reg, 14
  !pshufd reg, reg, 0x28; 0 - 2 - 2 - 0
  !movdqa xmm7, reg
  ; delta rotate 18 - 14 = 4 right
  !psrlq xmm7, 4
  !pshufd xmm7, xmm7, 0x28; 0 - 2 - 2 - 0
  !pxor reg, xmm7
  ; delta rotate 41 - 18 = 23 right
  !psrlq xmm7, 23
  !pshufd xmm7, xmm7, 0x28; 0 - 2 - 2 - 0
  !pxor reg, xmm7
EndMacro

; P - one SHA-512 round.
; The first eight arguments are the MMX registers that currently play the roles
; of the eight working variables; 'offset' is the byte offset of this round
; within the current group of eight rounds.
; Expects: esi = address of sha512constants, edi = address of the expanded
;          message schedule, ecx = byte offset of the current group.
; Effect:  the register in the 4th position receives its old value + temp2,
;          the register in the 8th position receives temp1 + temp2; the other
;          six registers are only read. The caller rotates the argument list
;          instead of moving values between registers.
; Clobb:   xmm0 - xmm7
; ('!' inside the F1 comments below is the PureBasic XOR operator.)
Macro P(a,b,c,d,e,f,g,h, offset) ; By wilbert
  !movq2dq xmm0, a
  !movq2dq xmm1, e
  !movq xmm6, xmm0
  !movq xmm5, xmm1
  SIGMA2M(xmm0); xmm0 = temp1 = Sigma2(a)
  SIGMA3M(xmm1); xmm1 = temp2 = Sigma3(e)
  !movq2dq xmm2, h
  !paddq xmm1, xmm2; temp2 + h
  !movq xmm2, [esi + ecx + offset]
  !paddq xmm1, xmm2; temp2 + *sha512constants [ ]
  !movq xmm2, [edi + ecx + offset]
  !paddq xmm1, xmm2; temp2 + *w [ ]
  ; F0 = ((a & b) | (c & (a | b)))
  !movq2dq xmm3, b
  !movq xmm4, xmm6
  !por xmm4, xmm3; xmm4 = a | b
  !pand xmm6, xmm3; xmm6 = a & b
  !movq2dq xmm3, c
  !pand xmm3, xmm4; xmm3 = c & (a | b)
  !por xmm6, xmm3; xmm6 = F0
  !paddq xmm0, xmm6; temp1 + F0
  ; F1 = (g ! (e & (f ! g)))
  !movq2dq xmm2, f
  !movq2dq xmm3, g
  !pxor xmm2, xmm3; xmm2 = f ! g
  !pand xmm5, xmm2; xmm5 = e & (f ! g)
  !pxor xmm3, xmm5; xmm3 = F1
  !paddq xmm1, xmm3; temp2 + F1
  !movq2dq xmm2, d
  !paddq xmm2, xmm1
  !movdq2q d, xmm2; d + temp2
  !paddq xmm0, xmm1
  !movdq2q h, xmm0; h = temp 1 + temp2
EndMacro

; ADD_RESULT - adds the 64-bit value stored at [edx + offset] to the given MMX
; register and writes the sum back to the same memory location.
; Expects edx = address of the context (state array at offset 0).
; The MMX register is left holding the sum.
Macro ADD_RESULT(reg, offset) ; By wilbert
  !paddq reg, [edx + offset]
  !movq [edx + offset], reg
EndMacro

; DEF_Q - emits four 64-bit constants. Each constant is given as a
; (high dword, low dword) pair and is written low dword first, i.e. in
; little-endian memory order.
Macro DEF_Q(h1, l1, h2, l2, h3, l3, h4, l4) ; By wilbert
  !dd l1,h1,l2,h2,l3,h3,l4,h4
EndMacro

; Table of the 80 64-bit round constants (20 rows of 4), read by the P macro
; through esi. Each DEF_Q argument pair is (high dword, low dword).
DataSection
!sha512constants:
DEF_Q(0x428a2f98,0xd728ae22 , 0x71374491,0x23ef65cd , 0xb5c0fbcf,0xec4d3b2f , 0xe9b5dba5,0x8189dbbc)
DEF_Q(0x3956c25b,0xf348b538 , 0x59f111f1,0xb605d019 , 0x923f82a4,0xaf194f9b , 0xab1c5ed5,0xda6d8118)
DEF_Q(0xd807aa98,0xa3030242 , 0x12835b01,0x45706fbe , 0x243185be,0x4ee4b28c , 0x550c7dc3,0xd5ffb4e2)
DEF_Q(0x72be5d74,0xf27b896f , 0x80deb1fe,0x3b1696b1 , 0x9bdc06a7,0x25c71235 , 0xc19bf174,0xcf692694)
DEF_Q(0xe49b69c1,0x9ef14ad2 , 0xefbe4786,0x384f25e3 , 0x0fc19dc6,0x8b8cd5b5 , 0x240ca1cc,0x77ac9c65)
DEF_Q(0x2de92c6f,0x592b0275 , 0x4a7484aa,0x6ea6e483 , 0x5cb0a9dc,0xbd41fbd4 , 0x76f988da,0x831153b5)
DEF_Q(0x983e5152,0xee66dfab , 0xa831c66d,0x2db43210 , 0xb00327c8,0x98fb213f , 0xbf597fc7,0xbeef0ee4)
DEF_Q(0xc6e00bf3,0x3da88fc2 , 0xd5a79147,0x930aa725 , 0x06ca6351,0xe003826f , 0x14292967,0x0a0e6e70)
DEF_Q(0x27b70a85,0x46d22ffc , 0x2e1b2138,0x5c26c926 , 0x4d2c6dfc,0x5ac42aed , 0x53380d13,0x9d95b3df)
DEF_Q(0x650a7354,0x8baf63de , 0x766a0abb,0x3c77b2a8 , 0x81c2c92e,0x47edaee6 , 0x92722c85,0x1482353b)
DEF_Q(0xa2bfe8a1,0x4cf10364 , 0xa81a664b,0xbc423001 , 0xc24b8b70,0xd0f89791 , 0xc76c51a3,0x0654be30)
DEF_Q(0xd192e819,0xd6ef5218 , 0xd6990624,0x5565a910 , 0xf40e3585,0x5771202a , 0x106aa070,0x32bbd1b8)
DEF_Q(0x19a4c116,0xb8d2d0c8 , 0x1e376c08,0x5141ab53 , 0x2748774c,0xdf8eeb99 , 0x34b0bcb5,0xe19b48a8)
DEF_Q(0x391c0cb3,0xc5c95a63 , 0x4ed8aa4a,0xe3418acb , 0x5b9cca4f,0x7763e373 , 0x682e6ff3,0xd6b2b8a3)
DEF_Q(0x748f82ee,0x5defb2fc , 0x78a5636f,0x43172f60 , 0x84c87814,0xa1f0ab72 , 0x8cc70208,0x1a6439ec)
DEF_Q(0x90befffa,0x23631e28 , 0xa4506ceb,0xde82bde9 , 0xbef9a3f7,0xb2c67915 , 0xc67178f2,0xe372532b)
DEF_Q(0xca273ece,0xea26619c , 0xd186b8c7,0x21c0c207 , 0xeada7dd6,0xcde0eb1e , 0xf57d4f7f,0xee6ed178)
DEF_Q(0x06f067aa,0x72176fba , 0x0a637dc5,0xa2c898a6 , 0x113f9804,0xbef90dae , 0x1b710b35,0x131c471b)
DEF_Q(0x28db77f5,0x23047d84 , 0x32caab7b,0x40c72493 , 0x3c9ebe0a,0x15c9bebc , 0x431d67c4,0x9c100d4c)
DEF_Q(0x4cc5d4be,0xcb3e42b6 , 0x597f299c,0xfc657e2a , 0x5fcb6fab,0x3ad6faec , 0x6c44198c,0x4a475817)
EndDataSection

; Returns the address of the hex-conversion routine embedded after the
; ProcedureReturn. The routine is called through the HexBytes prototype below
; (PrototypeC, 32-bit x86, caller cleans the stack):
;   In:    [esp + 4]  = *addr    source bytes
;          [esp + 8]  = numBytes number of source bytes
;          [esp + 12] = *buffer  receives 2 * numBytes lowercase hex characters
;                                (one byte per character, NO terminating zero
;                                is written)
;   Out:   eax = *buffer
;   Saved: esi, edi
;   Clobb: ecx, flags
; A numBytes value of zero or less writes nothing. (The previous 'loop'
; instruction would have wrapped ecx and run about 2^32 iterations.)
Procedure HexBytesAddr__() ; By Wilbert
  !mov eax,hexBytes_start
  ProcedureReturn
  !hexBytes_start:
  !push esi
  !push edi
  !mov esi, [esp + 12]
  !mov ecx, [esp + 16]
  !mov edi, [esp + 20]
  !push edi; kept on the stack as the return value
  !test ecx, ecx
  !jle hexBytes_done
  !hexBytes_loop:
  !lodsb
  ; al = high nibble, ah = low nibble; stosw stores al first, so the
  ; high-nibble character comes first in the output
  !mov ah, al
  !shr al, 4
  !and ax, 0x0f0f
  !or ax, 0x3030
  ; values above '9' are moved up to 'a'..'f' (0x3a + 39 = 0x61)
  !cmp ah, 0x3a
  !jb hexBytes_1
  !add ah, 39
  !hexBytes_1:
  !cmp al, 0x3a
  !jb hexBytes_2
  !add al, 39
  !hexBytes_2:
  !stosw
  !dec ecx
  !jnz hexBytes_loop
  !hexBytes_done:
  !pop eax
  !pop edi
  !pop esi
  !ret
EndProcedure

; Function pointer bound once at program start to the embedded routine.
PrototypeC.l ProtoHexBytes(*addr, numBytes, *buffer)
Global HexBytes.ProtoHexBytes = HexBytesAddr__()

; Returns the address of a small byte-swap routine that is embedded after the
; ProcedureReturn; the embedded code is never reached by falling through.
; The routine is called through the ChangeEndian64 prototype below (PrototypeC,
; caller cleans the stack):
;   In:  [esp + 4] = low dword, [esp + 8] = high dword of the 64-bit argument
;   Out: edx:eax   = argument with its eight bytes reversed
;   Clobb: eax, edx only
Procedure ChgEnd64Addr__() ; By idle
  ; small adaptation by Wilbert to
  ; work around a Purebasic OS X bug
  !mov eax,ChgEnd64_start
  ProcedureReturn
  !ChgEnd64_start:  
  !mov  eax, [esp + 8]
  !mov  edx, [esp + 4]
  ; swapping the bytes inside each half and exchanging the halves
  ; reverses all eight bytes
  !bswap eax
  !bswap edx 
  !ret
EndProcedure

; Function pointer bound once at program start to the embedded routine.
PrototypeC.q ProtoChgEnd64(value.q)
Global ChangeEndian64.ProtoChgEnd64 = ChgEnd64Addr__()

;================================================================
;                  LOCAL FUNCTIONS
;================================================================

Procedure sha384_starts(*ctx.sha512_context )
  ; Prepares *ctx for a new SHA-384 computation: loads the eight SHA-384
  ; initial hash values and resets the byte counter.
  ; *ctx must point to a writable sha512_context.
  
  *ctx\state[0] = $cbbb9d5dc1059ed8
  *ctx\state[1] = $629a292a367cd507
  *ctx\state[2] = $9159015a3070dd17
  *ctx\state[3] = $152fecd8f70e5939
  *ctx\state[4] = $67332667ffc00b31
  *ctx\state[5] = $8eb44a8768581511
  *ctx\state[6] = $db0c2e0d64f98fa7
  *ctx\state[7] = $47b5481dbefa4fa4
  
  ; shaQuad_update / shaQuad_finish derive the buffer fill level and the
  ; padding from 'total', so it must start at zero even when the context is
  ; reused or was not allocated from zero-filled memory.
  *ctx\total = 0
  
EndProcedure

Procedure sha512_starts( *ctx.sha512_context )
  ; Prepares *ctx for a new SHA-512 computation: loads the eight SHA-512
  ; initial hash values and resets the byte counter.
  ; *ctx must point to a writable sha512_context.
  
  *ctx\state[0] = $6a09e667f3bcc908
  *ctx\state[1] = $bb67ae8584caa73b
  *ctx\state[2] = $3c6ef372fe94f82b
  *ctx\state[3] = $a54ff53a5f1d36f1
  *ctx\state[4] = $510e527fade682d1
  *ctx\state[5] = $9b05688c2b3e6c1f
  *ctx\state[6] = $1f83d9abfb41bd6b
  *ctx\state[7] = $5be0cd19137e2179
  
  ; shaQuad_update / shaQuad_finish derive the buffer fill level and the
  ; padding from 'total', so it must start at zero even when the context is
  ; reused or was not allocated from zero-filled memory.
  *ctx\total = 0
  
EndProcedure

; Returns the address of the SHA-512 compression routine embedded after the
; ProcedureReturn. The routine is called through the sha512_process prototype
; below (PrototypeC, 32-bit x86, caller cleans the stack; needs MMX and SSE2):
;   In:    [esp + 4] = *ctx   (sha512_context, state array at offset 0)
;          [esp + 8] = *bytes (one 128-byte message block)
;   Out:   the eight state words in *ctx are updated in place
;   Saved: esi, edi
;   Clobb: eax, ecx, edx, flags, mm0 - mm7, xmm0 - xmm7 (emms is executed
;          before returning so the FPU is usable again)
;   Stack: 656 bytes = 640-byte message schedule + 16 bytes alignment slack
Procedure sha512_process_addr__() ; By wilbert
  !mov eax, sha512_process_start
  ProcedureReturn
  !sha512_process_start:
  !push esi
  !push edi
  !mov edx, [esp + 12]; edx = *ctx.sha512_context
  !mov esi, [esp + 16]; esi = *bytes.UINT64_BUFFER
  !sub esp, 656
  !mov edi, esp
  ; round up to the next 16-byte boundary so the buffer stays at or above
  ; esp; 640 + 15 <= 656, so it always fits in the reserved area
  !add edi, 0xf
  !and edi, 0xfffffff0; edi = *w.UINT64_BUFFER
  
  ; copy from *bytes to *w and change endian
  ; (ecx runs 120, 112, ... 0; the borrow of the last sub ends the loop)
  !mov ecx, 120
  !changeEndianLoop:
  !mov eax, [esi + ecx]
  !bswap eax
  !mov [edi + ecx + 4], eax
  !mov eax, [esi + ecx + 4]
  !bswap eax
  !mov [edi + ecx], eax
  !sub ecx, 8
  !jnc changeEndianLoop
  
  ; sigma 0 & sigma1 loop
  ; expands w[16..79]; ecx = byte offset of w[t], from 128 up to 640
  !mov ecx, 128
  !sigma01Loop:
  !movq xmm2, [edi + ecx - 128]; w\w[t-16]
  !movq xmm0, [edi + ecx - 120]; w\w[t-15]
  !movq xmm3, [edi + ecx - 56] ; w\w[t-7]
  !movq xmm1, [edi + ecx - 16] ; w\w[t-2]
  SIGMA0M(xmm0)
  SIGMA1M(xmm1)
  !paddq xmm0, xmm1
  !paddq xmm0, xmm2
  !paddq xmm0, xmm3
  !movq [edi + ecx], xmm0
  !add ecx, 8
  !cmp ecx, 640
  !jne sigma01Loop
  
  ; load the eight state words into the MMX registers (working variables)
  !movq mm0, [edx]
  !movq mm1, [edx + 8]
  !movq mm2, [edx + 16]
  !movq mm3, [edx + 24]
  !movq mm4, [edx + 32]
  !movq mm5, [edx + 40]
  !movq mm6, [edx + 48]
  !movq mm7, [edx + 56]
  
  !mov esi, sha512constants; esi = *sha512constants
  !xor ecx, ecx
  ; loop 10 times
  ; (8 rounds per pass; the register roles rotate by one position per round
  ;  and are back in their starting order after eight rounds)
  !sigma23Loop:
  P( mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0)
  P( mm7, mm0, mm1, mm2, mm3, mm4, mm5, mm6, 8)
  P( mm6, mm7, mm0, mm1, mm2, mm3, mm4, mm5, 16)
  P( mm5, mm6, mm7, mm0, mm1, mm2, mm3, mm4, 24)
  P( mm4, mm5, mm6, mm7, mm0, mm1, mm2, mm3, 32)
  P( mm3, mm4, mm5, mm6, mm7, mm0, mm1, mm2, 40)
  P( mm2, mm3, mm4, mm5, mm6, mm7, mm0, mm1, 48)
  P( mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm0, 56)
  !add ecx, 64
  !cmp ecx, 640
  !jne sigma23Loop
  
  ; state[i] = state[i] + working variable i
  ADD_RESULT(mm0, 0)
  ADD_RESULT(mm1, 8)
  ADD_RESULT(mm2, 16)
  ADD_RESULT(mm3, 24)
  ADD_RESULT(mm4, 32)
  ADD_RESULT(mm5, 40)
  ADD_RESULT(mm6, 48)
  ADD_RESULT(mm7, 56)
    
  !emms
  !add esp, 656
  !pop edi
  !pop esi
  !ret
EndProcedure

; Function pointer bound once at program start to the embedded routine.
PrototypeC sha512_process_proto(*ctx.sha512_context, *bytes.UINT64_BUFFER)
Global sha512_process.sha512_process_proto = sha512_process_addr__()

Procedure shaQuad_update( *ctx.sha512_context, *input, length, jobsize.q, *callback )
  ; Feeds 'length' bytes starting at *input into the running hash in *ctx.
  ;
  ;   *ctx      - hash context; \total counts the bytes fed so far and \buffer
  ;               stages an incomplete 128-byte block between calls
  ;   *input    - data to hash
  ;   length    - number of bytes at *input (values <= 0 are ignored)
  ;   jobsize   - size of the whole job, used only to compute the percentage
  ;   *callback - optional progress function taking one percentage argument, or 0
  ;
  ; Progress is derived from *ctx\total instead of a Static counter, so every
  ; context reports its own progress and a second hash in the same program no
  ; longer starts above 100%.
  
  Protected left.l, fill.l
  Protected processed.q, percent.q
  
  ; Nothing to do; also keeps a negative length away from CopyMemory().
  If length <= 0 : ProcedureReturn : EndIf
  
  left = *ctx\total & 127   ; bytes already staged in \buffer
  fill = 128 - left         ; bytes needed to complete the staged block
  
  *ctx\total + length
  
  ; Complete and process a previously staged partial block first.
  If left And (length >= fill)
    CopyMemory( *input, @*ctx\buffer[0] + left, fill )
    sha512_process( *ctx, @*ctx\buffer[0] )
    length - fill
    *input + fill
    left = 0
  EndIf
  
  ; Process whole blocks straight from the caller's memory.
  While length >= 128
    sha512_process( *ctx, *input )
    length - 128
    *input + 128
    
    If *callback And jobsize > 0
      ; Bytes consumed so far = bytes fed minus bytes still pending in this call.
      processed = *ctx\total - length
      If (processed & $ffff) = 0   ; report once per 64 KiB
        percent = 100 * processed / jobsize
        If percent > 100 : percent = 100 : EndIf
        CallFunctionFast(*callback, percent)
      EndIf
    EndIf
    
  Wend
  
  ; Stage the remaining tail (< 128 bytes) for the next call.
  If length
    CopyMemory( *input, @*ctx\buffer[0] + left, length )
  EndIf
  
EndProcedure

Procedure shaQuad_finish( *ctx.sha512_context, *digest.UINT64_BUFFER, jobsize.q, full, *callback )
  ; Appends the padding and the message length to the hash in *ctx and writes
  ; the eight byte-swapped state words to *digest.
  ;
  ;   *ctx      - hash context that received all message data
  ;   *digest   - receives 8 quads (64 bytes)
  ;   jobsize   - message length in bytes; encoded as a bit count (jobsize << 3)
  ;   full      - not used by this procedure (kept for interface compatibility)
  ;   *callback - optional progress function; called once with 100 at the end
  
  Protected last.l, padn.l
  Protected i
  Protected msglen.msglen
  Protected sha512_padding.UINT8_BUFFER   ; locals start zeroed; only the first byte is set
  
  sha512_padding\b[0] = $80
  
  ; NOTE(review): the low 64 bits of the bit count go into the field named
  ; 'highpart'; presumably that field is the second quad of the msglen
  ; structure - confirm against its definition.
  msglen\highpart = ChangeEndian64(jobsize<<3)
  
  ; Pad so that exactly 16 bytes remain in the final 128-byte block.
  last = *ctx\total & 127
  If last < 112
    padn = 112 - last
  Else
    padn = 240 - last
  EndIf
  
  ; The padding and length bytes are not part of the job, so no progress
  ; callback is forwarded here. Forwarding it with jobsize = padn or 16 could
  ; report percentages far above 100.
  shaQuad_update( *ctx, @sha512_padding, padn, padn, 0 )
  shaQuad_update( *ctx, @msglen, 16, 16, 0 )
  
  For i = 0 To 7
    *digest\w[i] = ChangeEndian64(*ctx\state[i])
  Next
  
  If *callback
    CallFunctionFast(*callback, 100)
  EndIf
  
EndProcedure

;================================================================
;                  EXPORTED FUNCTIONS
;================================================================

Procedure.s shaQuadFingerprint(*datapointer, Length, full, *callback=0) ; Data address, data size, [ ,<procaddress> ]
  ; Hashes 'Length' bytes at *datapointer and returns the digest as a string.
  ;
  ;   *datapointer - start of the data
  ;   Length       - number of bytes to hash (0 is allowed)
  ;   full         - non-zero: sha512 start values, 64 digest bytes are output
  ;                  zero:     sha384 start values, 48 digest bytes are output
  ;   *callback    - optional progress function, or 0
  ;
  ; Returns an empty string when the arguments are invalid or memory cannot
  ; be allocated.
  
  Protected digest.s
  Protected iterations
  Protected *ctx.sha512_context, *sha512sum, *output
  
  ; Reject input that would otherwise be dereferenced or copied blindly.
  If Length < 0 Or (*datapointer = 0 And Length > 0)
    ProcedureReturn digest
  EndIf
  
  *ctx       = AllocateMemory(SizeOf(sha512_context))
  *sha512sum = AllocateMemory(64)
  ; 2 characters per digest byte, plus one spare byte in case HexBytes()
  ; writes a terminator (its implementation is not visible here).
  *output    = AllocateMemory(129)
  
  If *ctx And *sha512sum And *output
    
    If full
      sha512_starts( *ctx )
      iterations = 64
    Else
      sha384_starts( *ctx )
      iterations = 48
    EndIf
    
    shaQuad_update( *ctx, *datapointer, Length, Length, *callback )
    shaQuad_finish( *ctx, *sha512sum, Length, full, *callback )
    
    digest = PeekS(HexBytes(*sha512sum, iterations, *output), iterations << 1, #PB_Ascii)
    
  EndIf
  
  ; Release whatever was successfully allocated.
  If *output    : FreeMemory(*output)    : EndIf
  If *sha512sum : FreeMemory(*sha512sum) : EndIf
  If *ctx       : FreeMemory(*ctx)       : EndIf
  
  ProcedureReturn digest
  
EndProcedure

ProcedureDLL.s sha512Fingerprint(*datapointer, Length, *callback=0) ; Data address, data size, [ ,<procaddress> ]
  ; Memory-buffer entry point: forwards to shaQuadFingerprint() with full = 1.
  Protected fingerprint.s
  fingerprint = shaQuadFingerprint(*datapointer, Length, 1, *callback)
  ProcedureReturn fingerprint
EndProcedure

ProcedureDLL.s sha384Fingerprint(*datapointer, Length, *callback=0) ; Data address, data size, [ ,<procaddress> ]
  ; Memory-buffer entry point: forwards to shaQuadFingerprint() with full = 0.
  Protected fingerprint.s
  fingerprint = shaQuadFingerprint(*datapointer, Length, 0, *callback)
  ProcedureReturn fingerprint
EndProcedure

Procedure.s shaQuadFileFingerprint(filename.s, full, *callback=0) ; filename$, [ ,<procaddress> ]
  ; Hashes the contents of a file and returns the digest as a string.
  ;
  ;   filename  - file to hash
  ;   full      - non-zero: sha512 start values, 64 digest bytes are output
  ;               zero:     sha384 start values, 48 digest bytes are output
  ;   *callback - optional progress function, or 0
  ;
  ; Returns an empty string when the file cannot be opened, a read fails or
  ; memory cannot be allocated.
  
  Protected digest.s
  Protected *datapointer, *output, *sha512sum, *ctx.sha512_context
  Protected bytesread, fresult, iterations
  Protected chunksize = 4096
  Protected readok = #True
  Protected jobsize.q, totalread.q
  
  *ctx         = AllocateMemory(SizeOf(sha512_context))
  *sha512sum   = AllocateMemory(64)
  *datapointer = AllocateMemory(chunksize)
  ; 2 characters per digest byte, plus one spare byte in case HexBytes()
  ; writes a terminator (its implementation is not visible here).
  *output      = AllocateMemory(129)
  
  If *ctx And *sha512sum And *datapointer And *output
    
    ; ReadFile() opens an existing file read-only. OpenFile() asked for
    ; read/write access and created the file when it did not exist.
    fresult = ReadFile(#PB_Any, filename)
    
    If fresult
      jobsize = Lof(fresult)   ; used for progress reporting only
      
      If full
        sha512_starts( *ctx )
        iterations = 64
      Else
        sha384_starts( *ctx )
        iterations = 48
      EndIf
      
      While Not Eof(fresult)
        bytesread = ReadData(fresult, *datapointer, chunksize)
        If bytesread <= 0
          ; Read failure before end of file: stop instead of looping forever.
          readok = #False
          Break
        EndIf
        totalread + bytesread
        shaQuad_update( *ctx, *datapointer, bytesread, jobsize, *callback )
      Wend
      
      If readok
        ; Encode the number of bytes actually hashed, so the digest stays
        ; consistent even if the file size changed after Lof() was taken.
        shaQuad_finish( *ctx, *sha512sum, totalread, full, *callback )
        digest = PeekS(HexBytes(*sha512sum, iterations, *output), iterations << 1, #PB_Ascii)
      EndIf
      
      CloseFile(fresult)
    EndIf
    
  EndIf
  
  ; Release whatever was successfully allocated.
  If *output      : FreeMemory(*output)      : EndIf
  If *datapointer : FreeMemory(*datapointer) : EndIf
  If *sha512sum   : FreeMemory(*sha512sum)   : EndIf
  If *ctx         : FreeMemory(*ctx)         : EndIf
  
  ProcedureReturn digest
  
EndProcedure

ProcedureDLL.s sha512FileFingerprint(filename.s, *callback=0) ; filename$, [ ,<procaddress> ]
  ; File entry point: forwards to shaQuadFileFingerprint() with full = 1.
  ProcedureReturn shaQuadFileFingerprint(filename, 1, *callback)
EndProcedure

ProcedureDLL.s sha384FileFingerprint(filename.s, *callback=0) ; filename$, [ ,<procaddress> ]
  ; File entry point: forwards to shaQuadFileFingerprint() with full = 0.
  ProcedureReturn shaQuadFileFingerprint(filename, 0, *callback)
EndProcedure