Page 1 of 2

ChaCha cipher module

Posted: Wed Aug 05, 2015 5:51 pm
by wilbert
Should be pretty fast and secure. :)
Hopefully everything works. Use at your own risk :wink:

If you need more speed, you can consider using ChaCha12 (set rounds to 12).
It's not as secure as ChaCha20 but still pretty good.

Code: Select all

; ChaCha module by Wilbert (SSE2 required)

; last update August 6, 2015

; algorithm by D. J. Bernstein (Public domain)


; Public interface of the ChaCha stream cipher module.
DeclareModule ChaCha
  
  ; Cipher context. SetKey and SetIV fill it, Crypt reads it and advances
  ; the block counter stored inside it.
  Structure ctx_ChaCha
    ; 16 x 32-bit state words as written by SetKey/SetIV:
    ;   bytes  0..15  constant ('expand 32-byte k' or 'expand 16-byte k')
    ;   bytes 16..47  key material
    ;   bytes 48..55  64-bit block counter
    ;   bytes 56..63  64-bit IV
    input.l[16]
    ; number of double rounds (SetKey stores Rounds shifted right by one)
    drounds.l
  EndStructure
  
  ; Install a raw 128 or 256 bit key and the round count; also zeroes counter and IV.
  Declare SetKey(*ctx.ctx_ChaCha, *Key.Ascii, KeySize = 256, Rounds = 20)
  ; Derive a 256 bit key from a string and install it.
  Declare SetAsciiKey(*ctx.ctx_ChaCha, Key.s, Rounds = 20)
  ; Install a 64 bit IV and the starting block counter.
  Declare SetIV(*ctx.ctx_ChaCha, *IV.Ascii, Counter.q = 0)
  ; XOR Size bytes of *Input with the key stream into *Output.
  Declare Crypt(*ctx.ctx_ChaCha, *Input, *Output, Size)
  ; SetIV followed by Crypt for one self-contained buffer.
  Declare CryptPacket(*ctx.ctx_ChaCha, *IV.Ascii, *Input, *Output, Size)
  ; SetIV followed by Crypt over the contents of a file.
  Declare CryptFile(*ctx.ctx_ChaCha, *IV.Ascii, InputFile.s, OutputFile.s)
  
EndDeclareModule

Module ChaCha
  
  EnableASM
  EnableExplicit
  DisableDebugger
  
  ; *** Macros and DataSection ***
  
  ; On 32-bit x86 builds, map the 64-bit register names onto their 32-bit
  ; counterparts so that the PureBasic-level assembly lines in this module can
  ; be written once, using the r** names, for both processor targets.
  CompilerIf #PB_Compiler_Processor = #PB_Processor_x86
    Macro rax : eax : EndMacro
    Macro rbx : ebx : EndMacro   
    Macro rcx : ecx : EndMacro
    Macro rdx : edx : EndMacro
    Macro rsi : esi : EndMacro
    Macro rdi : edi : EndMacro
    Macro rsp : esp : EndMacro
  CompilerEndIf
  
  ; Thin wrappers around SSE2 instructions. Each one emits the instruction as a
  ; raw assembler line ('!' prefix) with the two macro arguments substituted as
  ; its operands.
  ; NOTE(review): presumably these exist so that operands such as [rax] go
  ; through PureBasic macro expansion (rax -> eax on x86) before reaching the
  ; assembler - confirm.
  
  ; 16-byte move, memory operand must be 16-byte aligned
  Macro M_movdqa(arg1, arg2)
    !movdqa arg1, arg2
  EndMacro
  
  ; 16-byte move, no alignment requirement
  Macro M_movdqu(arg1, arg2)
    !movdqu arg1, arg2
  EndMacro
  
  ; 8-byte move (low quadword of an xmm register)
  Macro M_movq(arg1, arg2)
    !movq arg1, arg2
  EndMacro
  
  ; add four packed 32-bit integers
  Macro M_paddd(arg1, arg2)
    !paddd arg1, arg2
  EndMacro
  
  ; One add / xor / rotate step applied to four 32-bit lanes at once:
  ;   reg0 = reg0 + reg1
  ;   reg2 = reg2 XOR reg0
  ;   reg2 = reg2 rotated left by lr bits (per 32-bit lane)
  ; For lr = 16 the rotation is done by swapping the two 16-bit halves of each
  ; lane with pshuflw/pshufhw; for any other lr it is built from two shifts
  ; and an OR. The generic path overwrites xmm4 as scratch.
  Macro M_CryptQR(reg0, reg1, reg2, lr)
    !paddd reg0, reg1
    !pxor reg2, reg0
    CompilerIf lr = 16
      !pshuflw reg2, reg2, 10110001b
      !pshufhw reg2, reg2, 10110001b
    CompilerElse
      !movdqa xmm4, reg2
      !pslld reg2, lr
      !psrld xmm4, 32-lr
      !por reg2, xmm4
    CompilerEndIf
  EndMacro
  
  ; XOR the next 16 input bytes with the 16 key stream bytes held in reg.
  ; Register roles (set up by the Crypt procedure): rbx = bytes remaining,
  ; rsi = input pointer, rdi = output pointer.
  ; The macro first subtracts 16 from rbx and copies reg into xmm4. If the
  ; subtraction borrowed (fewer than 16 bytes were left) it jumps to
  ; chacha.l_crypt2 with the key stream still in xmm4 and rbx already reduced
  ; by 16. Otherwise it processes a full 16-byte chunk and advances both
  ; pointers. The raw '!' lines are written out per processor target with the
  ; matching register names. xmm4 is overwritten.
  Macro M_CryptXor(reg)
    sub rbx, 16
    !movdqa xmm4, reg
    !jc chacha.l_crypt2
    CompilerIf #PB_Compiler_Processor = #PB_Processor_x86
      !movdqu xmm4, [esi]
      !pxor xmm4, reg
      !movdqu [edi], xmm4
    CompilerElse
      !movdqu xmm4, [rsi]
      !pxor xmm4, reg
      !movdqu [rdi], xmm4
    CompilerEndIf
    add rsi, 16
    add rdi, 16
  EndMacro
  
  ; 16-byte ASCII constants that SetKey copies into the first 16 bytes of the
  ; state: sigma is used with 256 bit keys, tau with 128 bit keys.
  DataSection
    !chacha.l_sigma: db 'expand 32-byte k'
    !chacha.l_tau: db 'expand 16-byte k'
  EndDataSection
  
  
  ; *** SetKey procedure ***
  ; Key has to be 128 or 256 bits
  
  ; Initialise a context from a raw key.
  ;   *ctx     context to fill
  ;   *Key     16 key bytes (KeySize <> 256) or 32 key bytes (KeySize = 256)
  ;   KeySize  256 selects the 32-byte layout, any other value the 16-byte one
  ;   Rounds   total round count; half of it (double rounds) is stored
  ; Block counter and IV (state bytes 48..63) are reset to zero.
  Procedure SetKey(*ctx.ctx_ChaCha, *Key.Ascii, KeySize = 256, Rounds = 20)
    
    ; decide on the key size first; none of the moves, the lea or the pxor
    ; below change the flags, so they are still valid at the jne
    mov rcx, [p.v_KeySize]
    cmp rcx, 256
    mov rcx, [p.p_Key]
    mov rdx, [p.p_ctx]
    
    ; start out with the 128 bit layout: key repeated twice, tau constant
    M_movdqu(xmm1, [rcx])
    !movdqa xmm2, xmm1
    lea rax, [chacha.l_tau]
    !pxor xmm3, xmm3
    !jne chacha.l_key1
    
    ; 256 bit key: second key half and sigma constant
    M_movdqu(xmm2, [rcx + 16])
    lea rax, [chacha.l_sigma]
    
    !chacha.l_key1:
    M_movdqu(xmm0, [rax])
    
    ; number of double rounds
    mov rcx, [p.v_Rounds]
    shr rcx, 1
    
    ; write constant, key, zeroed counter/IV and double round count
    M_movdqu([rdx + 48], xmm3)
    M_movdqu([rdx + 32], xmm2)
    M_movdqu([rdx + 16], xmm1)
    M_movdqu([rdx], xmm0)
    mov [rdx + 64], ecx
    
  EndProcedure
  
  
  ; *** SetAsciiKey procedure ***
  
  ; Build a 256 bit key from a string and install it with SetKey.
  ; The string is written as ASCII and repeated until 32 key bytes are filled.
  ;   *ctx    context to initialise
  ;   Key     key text, must not be empty
  ;   Rounds  total round count, passed on to SetKey
  ; Returns #True on success. Returns #False and leaves *ctx untouched when Key
  ; is empty: an empty string would never advance the write position, so the
  ; fill loop below would not terminate.
  Procedure SetAsciiKey(*ctx.ctx_ChaCha, Key.s, Rounds = 20)
    
    Protected.i pos
    ; 32 key bytes plus one spare element for the terminating null PokeS writes
    Protected Dim k.a(32)
    
    If Key = ""
      ProcedureReturn #False
    EndIf
    
    Repeat
      ; PokeS returns the number of bytes written (without the null)
      pos + PokeS(@k(pos), Key, 32 - pos, #PB_Ascii)
    Until pos >= 32
    
    SetKey(*ctx, @k(), 256, Rounds)
    
    ProcedureReturn #True
    
  EndProcedure
  
  
  ; *** SetIV procedure ***
  ; IV (initialization vector) has to be 64 bits  
  
  ; Store the starting block counter and the IV in the context.
  ;   *ctx     context to update
  ;   *IV      pointer to 8 IV bytes
  ;   Counter  64 bit block counter to start from
  ; State bytes 48..55 receive Counter, bytes 56..63 receive the IV.
  Procedure SetIV(*ctx.ctx_ChaCha, *IV.Ascii, Counter.q = 0)
    
    ; fetch the 8 IV bytes
    mov rax, [p.p_IV]
    M_movq(xmm4, [rax])
    
    ; fetch the counter and place the IV in the upper half next to it
    M_movq(xmm3, [p.v_Counter])
    !punpcklqdq xmm3, xmm4
    
    ; write both with a single 16-byte store
    mov rdx, [p.p_ctx]
    M_movdqu([rdx + 48], xmm3)
    
  EndProcedure
  
  
  
  ; *** Crypt procedure ***
  ; returns 0 on error, 1 on success
  
  ; XOR Size bytes from *Input with the ChaCha key stream and write them to
  ; *Output (the same call encrypts and decrypts).
  ;   *ctx     context prepared with SetKey and SetIV
  ;   *Input   source bytes
  ;   *Output  destination bytes (CryptFile passes the same buffer for both)
  ;   Size     number of bytes; must be greater than zero
  ; Returns 1 on success. Returns 0 without touching any data when Size is
  ; zero or negative, or when the context holds no positive double round count.
  ;
  ; One 64-byte key stream block is generated per loop pass and the 64 bit
  ; block counter is incremented for each generated block, including a final
  ; partial one. The counter is written back to the context on success, so a
  ; following call continues with the next block; unused key stream bytes of a
  ; partial block are not kept.
  ;
  ; Register roles: rbx = bytes remaining, rsi = input, rdi = output,
  ; rdx = context, rax = aligned state copy (return value at the end),
  ; ecx = double round counter, xmm0..xmm3 = working state rows,
  ; xmm4 = scratch, xmm5 = constant 1 for the counter increment.
  ;
  ; All scratch storage lives below the stack pointer, down to at most
  ; rsp - 103, without moving rsp.
  ; NOTE(review): this relies on nothing else writing below the stack pointer
  ; while the procedure runs. The System V x86-64 ABI reserves a 128-byte red
  ; zone for leaf functions; Windows x64 and the 32-bit ABIs give no such
  ; guarantee - confirm this is acceptable for every target.
  Procedure Crypt(*ctx.ctx_ChaCha, *Input, *Output, Size)
    
    ; backup registers
    mov [rsp - 8], rbx
    mov [rsp - 16], rsi
    mov [rsp - 24], rdi
    
    ; load procedure parameters
    ; rax = 0 is the return value for every early exit
    sub rax, rax
    mov rbx, [p.v_Size]
    test rbx, rbx
    ; Size is signed: reject zero and negative values alike (test clears the
    ; overflow flag, so jle fires on zero or on a set sign bit)
    !jle chacha.l_crypt5
    mov rdx, [p.p_ctx]
    mov ecx, [rdx + 64]
    !cmp ecx, 0
    !jng chacha.l_crypt5
    mov rsi, [p.p_Input]
    mov rdi, [p.p_Output]
    
    ; load state and make aligned copy
    ; rax is rounded down to a multiple of 16 so that movdqa can be used on it
    lea rax, [rsp - 88]
    shr rax, 4
    shl rax, 4
    M_movdqu(xmm0, [rdx])
    M_movdqu(xmm1, [rdx + 16])
    M_movdqu(xmm2, [rdx + 32])
    M_movdqu(xmm3, [rdx + 48])
    M_movdqa([rax], xmm0)
    M_movdqa([rax + 16], xmm1)
    M_movdqa([rax + 32], xmm2)
    M_movdqa([rax + 48], xmm3)
    
    ; perform double rounds
    ; xmm5 = 1 in its low quadword, used to step the block counter
    !mov ecx, 1
    !movd xmm5, ecx
    ; chacha.l_crypt0: start of one 64-byte block, reload the round counter
    !chacha.l_crypt0:
    mov ecx, [rdx + 64]
    ; chacha.l_crypt1: one double round per pass
    !chacha.l_crypt1:
    ; xmm0 = 3 2 1 0
    ; xmm1 = 7 6 5 4
    ; xmm2 = 11 10 9 8
    ; xmm3 = 15 14 13 12
    M_CryptQR(xmm0, xmm1, xmm3, 16)
    M_CryptQR(xmm2, xmm3, xmm1, 12)
    M_CryptQR(xmm0, xmm1, xmm3, 8)
    M_CryptQR(xmm2, xmm3, xmm1, 7)
    ; rotate rows 1..3 so the second half works on the diagonals
    !pshufd xmm1, xmm1, 00111001b
    !pshufd xmm2, xmm2, 01001110b
    !pshufd xmm3, xmm3, 10010011b
    ; xmm1 = 4 7 6 5
    ; xmm2 = 9 8 11 10
    ; xmm3 = 14 13 12 15
    M_CryptQR(xmm0, xmm1, xmm3, 16)
    M_CryptQR(xmm2, xmm3, xmm1, 12)
    M_CryptQR(xmm0, xmm1, xmm3, 8)
    M_CryptQR(xmm2, xmm3, xmm1, 7)
    ; rotate the rows back into their original lane order
    !pshufd xmm1, xmm1, 10010011b
    !pshufd xmm2, xmm2, 01001110b
    !pshufd xmm3, xmm3, 00111001b
    !sub ecx, 1
    !jnz chacha.l_crypt1
    
    ; add
    ; add the saved input state to the round output; the saved counter row is
    ; incremented in the same step so the copy is ready for the next block
    M_movdqa(xmm4, [rax + 48])
    !paddd xmm3, xmm4
    !paddq xmm4, xmm5; block += 1
    M_movdqa([rax + 48], xmm4)
    M_paddd(xmm2, [rax + 32])
    M_paddd(xmm1, [rax + 16])
    M_paddd(xmm0, [rax])
    
    ; xor
    ; up to four full 16-byte chunks; each macro leaves for chacha.l_crypt2
    ; as soon as fewer than 16 bytes remain
    M_CryptXor(xmm0)
    M_CryptXor(xmm1)
    M_CryptXor(xmm2)
    M_CryptXor(xmm3)
    test rbx, rbx
    !jz chacha.l_crypt4
    ; more data: reload the saved state (with the new counter) and loop
    M_movdqa(xmm0, [rax])
    M_movdqa(xmm1, [rax + 16])
    M_movdqa(xmm2, [rax + 32])
    M_movdqa(xmm3, [rax + 48])
    !jmp chacha.l_crypt0
    ; chacha.l_crypt2: tail of 0..15 bytes, key stream chunk is in xmm4
    !chacha.l_crypt2:
    ; undo the subtraction done by the macro; zero means nothing is left
    add rbx, 16
    !jz chacha.l_crypt4
    ; park the key stream in the first 16 bytes of the state copy (no longer
    ; needed there) and XOR the remaining bytes one by one, last to first
    M_movdqa([rax], xmm4)
    !chacha.l_crypt3:
    movzx ecx, byte [rax + rbx - 1]
    XOr cl, [rsi + rbx - 1]
    mov [rdi + rbx - 1], cl
    sub rbx, 1
    !jnz chacha.l_crypt3
    
    ; update block counter in state
    ; only the low 8 bytes (the counter) of the last row are written back
    !chacha.l_crypt4:
    M_movq(xmm3, [rax + 48])
    M_movq([rdx + 48], xmm3)
    mov rax, 1
    
    ; restore registers
    !chacha.l_crypt5:
    mov rbx, [rsp - 8]
    mov rsi, [rsp - 16]
    mov rdi, [rsp - 24]
    ; returns whatever is in rax at this point (0 or 1)
    ProcedureReturn
    
  EndProcedure
  
  
  ; *** CryptPacket procedure ***
  ; returns 0 on error, 1 on success
  
  ; Process one self-contained buffer: restart the key stream at block 0 for
  ; the given IV, then run Crypt over the data.
  ; Returns the result of Crypt (0 on error, 1 on success).
  Procedure CryptPacket(*ctx.ctx_ChaCha, *IV.Ascii, *Input, *Output, Size)
    
    Protected.i success
    
    SetIV(*ctx, *IV, 0)
    success = Crypt(*ctx, *Input, *Output, Size)
    
    ProcedureReturn success
    
  EndProcedure
  
  
  ; *** CryptFile procedure ***
  ; returns 0 on error, 1 on success
  ; OutputFile is overwritten !!!
  
  ; Encrypt or decrypt a whole file in 32768-byte chunks.
  ;   *ctx        context prepared with SetKey
  ;   *IV         pointer to 8 IV bytes; the block counter restarts at 0
  ;   InputFile   file to read
  ;   OutputFile  file to write; when it equals InputFile the file is
  ;               processed in place, otherwise it is created / overwritten
  ; Returns 1 when the last chunk was processed and written successfully.
  ; Returns 0 when the context has no positive round count, a file could not
  ; be opened, Crypt failed, a write was incomplete, or no data was read at
  ; all (e.g. an empty input file).
  ;
  ; Compared with a plain read/crypt/write loop this version
  ;  - does not create (and thereby truncate) OutputFile when the context is
  ;    unusable or InputFile cannot be opened,
  ;  - stops instead of writing the unprocessed buffer when Crypt fails,
  ;  - treats a short write as an error,
  ;  - only closes file numbers it opened itself (a failed #PB_Any open yields
  ;    0, and IsFile(0) could match an unrelated file #0 of the application).
  Procedure CryptFile(*ctx.ctx_ChaCha, *IV.Ascii, InputFile.s, OutputFile.s)
    
    Protected.i result, inFile, outFile, pos, nBytes
    ; 8192 longs = 32768 bytes of work buffer
    Protected Dim Buffer.l(8191)
    
    SetIV(*ctx, *IV)
    
    ; Crypt refuses to run without a positive double round count; detect that
    ; before any file is touched
    If *ctx\drounds <= 0
      ProcedureReturn 0
    EndIf
    
    If InputFile = OutputFile
      inFile = OpenFile(#PB_Any, InputFile)
      outFile = inFile
    Else
      inFile = ReadFile(#PB_Any, InputFile)
      If inFile
        outFile = CreateFile(#PB_Any, OutputFile)
      EndIf
    EndIf
    
    If inFile And outFile
      Repeat
        ; remember where this chunk came from so that in-place processing
        ; writes it back to the same position
        pos = Loc(inFile)
        nBytes = ReadData(inFile, @Buffer(), 32768)
        If nBytes
          result = Crypt(*ctx, @Buffer(), @Buffer(), nBytes)
          If result = 0
            Break
          EndIf
          FileSeek(outFile, pos)
          If WriteData(outFile, @Buffer(), nBytes) <> nBytes
            result = 0
            Break
          EndIf
        EndIf
      Until nBytes = 0
    EndIf
    
    If inFile
      CloseFile(inFile)
    EndIf
    If outFile And outFile <> inFile
      CloseFile(outFile)
    EndIf
    
    ProcedureReturn result
    
  EndProcedure
  
EndModule

Re: ChaCha module

Posted: Wed Aug 05, 2015 5:52 pm
by wilbert
Examples

Code: Select all

; Timing demo: encrypt test.jpg into test_.jpg with a fixed 256 bit key and
; 64 bit IV, then show the elapsed time in milliseconds.
DataSection
  key:
  Data.b $c4,$6e,$c1,$b1,$8c,$e8,$a8,$78,$72,$5a,$37,$e7,$80,$df,$b7,$35
  Data.b $1f,$68,$ed,$2e,$19,$4c,$79,$fb,$c6,$ae,$be,$e1,$a6,$67,$97,$5d
  iv:
  Data.b $1a,$da,$31,$d5,$cf,$68,$82,$21
EndDataSection

Define ctx.ChaCha::ctx_ChaCha
Define.i startTime, elapsed

ChaCha::SetKey(@ctx, ?key)

startTime = ElapsedMilliseconds()
ChaCha::CryptFile(@ctx, ?iv, "test.jpg", "test_.jpg")
elapsed = ElapsedMilliseconds() - startTime
MessageRequester("", Str(elapsed))

Code: Select all

; String round trip demo: encrypt a string into a separate buffer, show the
; cipher text, then decrypt it back into the string.

; create context and set key
ctx.ChaCha::ctx_ChaCha
ChaCha::SetAsciiKey(@ctx, "PureBasic ChaCha")

; set 64 bit initialization vector
; (fixed seed so the demo is reproducible; both CryptPacket calls below must
; use the same key and IV for the round trip to work)
RandomSeed(123)
RandomData(@iv.q, 8)

MyString.s = "This is a small test string"
MyStringLen = StringByteLength(MyString)

; the cipher text buffer is only used when the allocation succeeded
*Encrypted = AllocateMemory(MyStringLen)
If *Encrypted
  
  ; encrypt
  ChaCha::CryptPacket(@ctx, @iv, @MyString, *Encrypted, MyStringLen)
  ShowMemoryViewer(*Encrypted, MyStringLen)
  
  ; decrypt
  ChaCha::CryptPacket(@ctx, @iv, *Encrypted, @MyString, MyStringLen)
  
EndIf

Debug MyString

; release the cipher text buffer
If *Encrypted
  FreeMemory(*Encrypted)
EndIf

Re: ChaCha module

Posted: Wed Aug 05, 2015 9:34 pm
by netmaestro
Looks excellent and thanks for sharing. Good clean code as always. One question, are zeros going to appear in the encrypted output? With AES we have to encrypt->base64->encrypted string if we want to show the result as a string, I'm just wondering is that necessary with this too?

Re: ChaCha module

Posted: Wed Aug 05, 2015 9:39 pm
by idle
nice, will be interesting to see how it compares to AES for speed

Re: ChaCha module

Posted: Wed Aug 05, 2015 10:24 pm
by IdeasVacuum
Very very interesting!
I nearly didn't bother to read this post though, thought it was to do with ballroom dancing... :mrgreen:

Re: ChaCha module

Posted: Wed Aug 05, 2015 11:54 pm
by Keya
wilbert she is really a beautiful elegant cipher isnt she! we learn a little about her at uni this year. Thankyou for your share :)
I saw this page https://eden.dei.uc.pt/~sneves/chacha/chacha.html
he change the following code to gain a speed improvement from 3.9 to 3.18 seconds/6146 to 7555 Mbps:

Code: Select all

movdqa %xmm15,%xmm6
psrld $16,%xmm15
pslld $16,%xmm6
pxor %xmm6,%xmm15
   ... to ...
pshufb %xmm6, %xmm15
I looked at your code; it's hard for me to tell, but it looks like you may be doing it in this part (and you use por where he uses pxor?):

Code: Select all

  Macro M_CryptQR(reg0, reg1, reg2, lr)
    !paddd reg0, reg1
    !pxor reg2, reg0
    CompilerIf lr = 16
      !pshuflw reg2, reg2, 10110001b
      !pshufhw reg2, reg2, 10110001b
    CompilerElse
      !movdqa xmm4, reg2
      !pslld reg2, lr
      !psrld xmm4, 32-lr
      !por reg2, xmm4
    CompilerEndIf
  EndMacro
But the article suggests you can do it for both 8 and 16; your code, if it is doing this, only does it for 16?

Now all it needs is Poly1305 for authentication! lol :D

Re: ChaCha module

Posted: Thu Aug 06, 2015 5:45 am
by wilbert
netmaestro wrote:One question, are zeros going to appear in the encrypted output? With AES we have to encrypt->base64->encrypted string if we want to show the result as a string, I'm just wondering is that necessary with this too?
Yes, zeros are going to appear in the encrypted output with this one also.
Keya wrote:I saw this page https://eden.dei.uc.pt/~sneves/chacha/chacha.html
he change the following code to gain a speed improvement from 3.9 to 3.18 seconds/6146 to 7555 Mbps:
I noticed that page also. It's true I only optimized for rotating 16 bits.
The reason for this, is that I wanted to stick with SSE2. Pshufb is a SSSE3 instruction.
I also wanted code that worked both on 32 and 64 bit systems so I also didn't use the extra registers 64 bit has.
When using newer instruction sets like for example AVX or AVX2, you can improve the speed of course.
My code isn't as fast as the one referenced there by the way (the referenced code processes 4 blocks in parallel while I'm handling only 1 block at a time).
idle wrote:nice, will be interesting to see how it compares to AES for speed
If you know the answer, I'd like to hear it :)

Re: ChaCha cipher module

Posted: Thu Aug 06, 2015 8:10 am
by Inf0Byt3
Nice code Wilbert, thanks! It's lightning fast too.

Re: ChaCha cipher module

Posted: Thu Aug 06, 2015 8:51 pm
by Erich
Very cool. 8)

Did you check the implementation with test vectors?

Re: ChaCha cipher module

Posted: Thu Aug 06, 2015 8:53 pm
by netmaestro

Code: Select all

CompilerIf #PB_Compiler_Processor = #PB_Processor_x86
    Macro rax : eax : EndMacro
    Macro rbx : ebx : EndMacro   
    Macro rcx : ecx : EndMacro
    Macro rdx : edx : EndMacro
    Macro rsi : esi : EndMacro
    Macro rdi : edi : EndMacro
    Macro rsp : esp : EndMacro
  CompilerEndIf
Ok, now you're just showing off!!

(kidding aside, very clever)

Re: ChaCha cipher module

Posted: Thu Aug 06, 2015 9:43 pm
by wilbert
Erich wrote:Very cool. 8)

Did you check the implementation with test vectors?
Yes, here are some test vectors if you want to verify
https://tools.ietf.org/html/draft-strom ... vectors-00
I didn't implement a special keystream procedure, but you can just use a memory area with all zeros for both input and output.
The output should match the test vectors if you used the same key and iv.
netmaestro wrote:Ok, now you're just showing off!!

(kidding aside, very clever)
If I remember correctly I got this approach from somewhere on this forum.
Anyway, it works great to create code compatible with both x86 and x64.

Re: ChaCha cipher module

Posted: Thu Aug 06, 2015 10:21 pm
by Tenaja
wilbert wrote:
netmaestro wrote:Ok, now you're just showing off!!

(kidding aside, very clever)
If I remember correctly I got this approach from somewhere on this forum.
Anyway, it works great to create code compatible with both x86 and x64.
http://purebasic.fr/english/viewtopic.php?f=35&t=60280

I tried to get it to work, and came up with a dirtier solution, but Stargate made it clean.

Re: ChaCha module

Posted: Fri Aug 07, 2015 1:14 am
by RichAlgeni
IdeasVacuum wrote:Very very interesting!
I nearly didn't bother to read this post though, thought it was to do with ballroom dancing... :mrgreen:
But you're a beautiful dancer! :lol:

Re: ChaCha cipher module

Posted: Wed Apr 06, 2022 5:57 am
by netmaestro
I got an email telling me there was a new reply in this topic but there isn't. I'm glad I got it though because I'd forgotten this little gem. Maybe some spammer got squashed before I got here?

Re: ChaCha cipher module

Posted: Thu May 05, 2022 5:39 am
by pdwyer
Thanks for the bump, I hadn't seen this at all.
Computerphile has a good video on this algorithm for those interested: https://www.youtube.com/watch?v=UeIpq-C-GSA