AsmLib - Optimized C Library

Share your advanced PureBasic knowledge/code with the community.
Toni6
User
User
Posts: 45
Joined: Mon Apr 23, 2012 1:39 pm

AsmLib - Optimized C Library

Post by Toni6 »

Asmlib is a function library to call from C or C++ and now PB for all x86 and x86-64 platforms. It is not
intended to be a complete function library, but contains mainly:

• Faster versions of several standard C functions
• Useful functions that are difficult to find elsewhere
• Functions that are best written in assembly language
• Efficient random number generators

These functions are written in assembly language for the sake of optimizing speed. Many of
the functions have multiple branches for different instruction sets, such as SSE2, SSE4.2,
AVX, AVX2, etc. These functions automatically detect which instruction set is supported
by the computer they are running on and select the optimal branch.
This library is also intended as a showcase to illustrate the optimization methods explained
in my optimization manuals and an example of how to make a cross-platform function
library.

The latest version of asmlib is always available at http://www.agner.org/optimize.
AsmLib: http://www.agner.org/optimize/asmlib.zip

Code: Select all

EnableExplicit

; PureBasic include file that binds Agner Fog's asmlib (Windows COFF builds) to PureBasic.
;
; The asmlib string functions take C "char*" strings, so PureBasic strings cannot be
; passed to them directly when the program is compiled in Unicode mode.
CompilerIf #PB_Compiler_Unicode And #PB_Compiler_Debugger
  Debug "ASMLIB: In Unicode mode you can't use the asmlib string functions with pb strings"
CompilerEndIf

; Include guard: everything below is declared only once per compilation.
CompilerIf Defined(AGNERFOG_ASMLIB_H, #PB_Constant) = 0
#AGNERFOG_ASMLIB_H = 1

;- NOTES
;
; Define #ASMLIB_OVERRIDE_STANDARD_LIBRARY before including this file to override
; the standard library with these optimized functions (selects the "...o.lib" build).
;

IncludePath #PB_Compiler_FilePath

; Select the library matching the target CPU.
; #PB_Compiler_FilePath already ends with a path separator, so the sub-folder is
; appended without a leading backslash.
CompilerIf #PB_Compiler_Processor = #PB_Processor_x86
  
  CompilerIf Defined(ASMLIB_OVERRIDE_STANDARD_LIBRARY, #PB_Constant) = 1
    #ASMLIB_LIB = #PB_Compiler_FilePath + "asmlib\libacof32o.lib"
  CompilerElse
    #ASMLIB_LIB = #PB_Compiler_FilePath + "asmlib\libacof32.lib"
  CompilerEndIf
  
CompilerElse ; x64
  
  CompilerIf Defined(ASMLIB_OVERRIDE_STANDARD_LIBRARY, #PB_Constant) = 1
    #ASMLIB_LIB = #PB_Compiler_FilePath + "asmlib\libacof64o.lib"
  CompilerElse
    #ASMLIB_LIB = #PB_Compiler_FilePath + "asmlib\libacof64.lib"
  CompilerEndIf
  
CompilerEndIf


; In override mode the library is handed to the linker through a /DEFAULTLIB switch
; and the function declarations below are imported with an empty file name.
; Otherwise the declarations are imported straight from the library file.
CompilerIf Defined(ASMLIB_OVERRIDE_STANDARD_LIBRARY, #PB_Constant) = 1
  ImportC "/DEFAULTLIB:"+#ASMLIB_LIB
  EndImport
  
  #ASMLIB_IMP_FILE = ""
CompilerElse
  #ASMLIB_IMP_FILE = #ASMLIB_LIB
CompilerEndIf

CompilerIf #PB_Compiler_Processor = #PB_Processor_x86
  
ImportC #ASMLIB_IMP_FILE ; CDECL
  
CompilerElse ; x64 - FASTCALL
  
Import #ASMLIB_IMP_FILE
    
CompilerEndIf

  ; memory functions
  A_memcpy.i(*dest, *src, count.i)                      ; Copy count bytes from src to dest
  A_memmove.i(*dest, *src, count.i)                     ; Same as memcpy, allows overlap between src and dest
  A_memset.i(*dest, c.l, count.i)                       ; Set count bytes in dest to (char)c
  A_memcmp.l(*buf1, *buf2, num.i)                       ; Compares two blocks of memory
  
  GetMemcpyCacheLimit.i()                               ; Data blocks bigger than this will be copied uncached by memcpy and memmove
  SetMemcpyCacheLimit(Value.i)                          ; Change limit in GetMemcpyCacheLimit
  GetMemsetCacheLimit.i()                               ; Data blocks bigger than this will be stored uncached by memset
  SetMemsetCacheLimit(Value.i)                          ; Change limit in GetMemsetCacheLimit
  
  ; string functions (zero-terminated C strings)
  A_strcat.i(*dest, *src)                               ; returns (char*) - Concatenate strings dest and src. Store result in dest
  A_strcpy.i(*dest, *src)                               ; returns (char*) - Copy string src to dest
  A_strlen.i(*str)                                      ; Get length of zero-terminated string
  A_strcmp.l(*a, *b)                                    ; Compare strings. Case sensitive
  A_stricmp.l(*string1, *string2)                       ; Compare strings. Case insensitive for A-Z only
  A_strstr.i(*haystack, *needle)                        ; returns (char*) - Search for substring in string
  A_strtolower(*string)                                 ; Convert string to lower case for A-Z only
  A_strtoupper(*string)                                 ; Convert string to upper case for a-z only
  A_substring.i(*dest, *source, pos.i, len.i)           ; Copy a substring from source into dest
  A_strspn.i(*str, *set)                                ; Find span of characters that belong to set
  A_strcspn.i(*str, *set)                               ; Find span of characters that don't belong to set
  strCountInSet.i(*str, *set)                           ; Count characters that belong to set
  strcount_UTF8.i(*str)                                 ; Counts the number of characters in a UTF-8 encoded string
  
  ; miscellaneous functions
  A_popcount.l(x.l)                                     ; Count 1-bits in 32-bit integer
  RoundD.l(x.d)                                         ; Round to nearest or even
  RoundF.l(x.f)                                         ; Round to nearest or even
  InstructionSet.l()                                    ; Tell which instruction set is supported
  ProcessorName.i()                                     ; ASCIIZ text describing microprocessor
  CpuType(*vendor.LONG, *family.LONG, *model.LONG)      ; Get CPU vendor, family and model
  DataCacheSize.i(level.l)                              ; Get size of data cache
  A_DebugBreak()                                        ; Makes a debug breakpoint
  ReadTSC.q()                                           ; Read microprocessor internal clock (64-bit counter; .q keeps all 64 bits on x86 too)
  cpuid_ex(*abcd, _eax.l, _ecx.l)                       ; call CPUID instruction (*abcd receives 4 longs; eax/ecx are C ints)
  
  ; integer division functions
  ; not done...
  
EndImport

CompilerEndIf
Example:

Code: Select all

; Benchmark: time 5 million CopyMemory() calls on a 2048-byte buffer.
; Set #TEST_OPTIMIZATIONS to 0 to measure without asmlib for comparison.

; Must be defined before asmlib.pbi is included so that the override build is selected.
#ASMLIB_OVERRIDE_STANDARD_LIBRARY = 1

#TEST_OPTIMIZATIONS = 1

CompilerIf #TEST_OPTIMIZATIONS = 1
XIncludeFile "asmlib.pbi"

; NOTE(review): presumably a dummy call so that the import section is actually
; referenced and the library gets linked - confirm.
A_strcmp(@"", @"")
CompilerEndIf

Define *buf1, *buf2
Define Time.l, EndTime.l
Define i.l

*buf1 = AllocateMemory(2048)
*buf2 = AllocateMemory(2048)

; AllocateMemory() returns 0 on failure; do not benchmark on null pointers.
If *buf1 = 0 Or *buf2 = 0
  Debug "Could not allocate the test buffers"
  If *buf1 : FreeMemory(*buf1) : EndIf
  If *buf2 : FreeMemory(*buf2) : EndIf
  End
EndIf

RandomData(*buf1, 2048)
RandomData(*buf2, 2048)

Time = timeGetTime_()

; 1 To 5000000 gives exactly 5 million copies (0 To 5000000 ran one extra iteration).
For i = 1 To 5000000
  CopyMemory(*buf1, *buf2, 2048)
Next i

EndTime = timeGetTime_() - Time

; Elapsed time in milliseconds
Debug EndTime

FreeMemory(*buf1)
FreeMemory(*buf2)
Poshu
Enthusiast
Enthusiast
Posts: 459
Joined: Tue Jan 25, 2005 7:01 pm
Location: Canada

Re: AsmLib - Optimized C Library

Post by Poshu »

I just read the documentation and can't keep myself from crying with joy at the thought of a much faster string system.
Thanks a lot, gonna toy with it.
kinglestat
Enthusiast
Enthusiast
Posts: 746
Joined: Fri Jul 14, 2006 8:53 pm
Location: Malta
Contact:

Re: AsmLib - Optimized C Library

Post by kinglestat »

If you truly want a fast string library take a look at cieve which I wrote some years back. Should be still in the forum and I still use.
I may not help with your coding
Just ask about mental issues!

http://www.lulu.com/spotlight/kingwolf
http://www.sen3.net
coco2
Enthusiast
Enthusiast
Posts: 461
Joined: Mon Nov 25, 2013 5:38 am
Location: Australia

Re: AsmLib - Optimized C Library

Post by coco2 »

I ran the test which moves 2048 bytes in memory 5 million times

Milliseconds:
1368 without
1171 with

Seems legit
kinglestat wrote:If you truly want a fast string library take a look at cieve which I wrote some years back. Should be still in the forum and I still use.
I can't find it... you sure you posted it?
User avatar
Demivec
Addict
Addict
Posts: 4270
Joined: Mon Jul 25, 2005 3:51 pm
Location: Utah, USA

Re: AsmLib - Optimized C Library

Post by Demivec »

coco2 wrote:
kinglestat wrote:If you truly want a fast string library take a look at cieve which I wrote some years back. Should be still in the forum and I still use.
I can't find it... you sure you posted it?
Here is the thread.
User avatar
Tenaja
Addict
Addict
Posts: 1959
Joined: Tue Nov 09, 2010 10:15 pm

Re: AsmLib - Optimized C Library

Post by Tenaja »

kinglestat wrote:If you truly want a fast string library take a look at cieve which I wrote some years back. Should be still in the forum and I still use.
I was getting excited about downloading Ceive as I read your description of it. Then I saw it was Windows only. Bummer. And not open source. Oh, well, maybe next time.

Nothing personal, I just don't use libraries that are closed. I have been burned too many times...hmmm...almost every time.
kinglestat
Enthusiast
Enthusiast
Posts: 746
Joined: Fri Jul 14, 2006 8:53 pm
Location: Malta
Contact:

Re: AsmLib - Optimized C Library

Post by kinglestat »

Fair enough
I may not help with your coding
Just ask about mental issues!

http://www.lulu.com/spotlight/kingwolf
http://www.sen3.net
Poshu
Enthusiast
Enthusiast
Posts: 459
Joined: Tue Jan 25, 2005 7:01 pm
Location: Canada

Re: AsmLib - Optimized C Library

Post by Poshu »

Tenaja wrote:I was getting excited about downloading Ceive as I read your description of it. Then I saw it was Windows only. Bummer. And not open source. Oh, well, maybe next time.

Nothing personal, I just don't use libraries that are closed. I have been burned too many times...hmmm...almost every time.
Same here, with the addition that I'm not a windows user.
User avatar
Keya
Addict
Addict
Posts: 1890
Joined: Thu Jun 04, 2015 7:10 am

Re: AsmLib - Optimized C Library

Post by Keya »

Toni6, thank you very much for sharing! I've made a couple of changes to your ASMLIB.PBI include to add support for Linux and Mac OS X; it seems to be OK.

asmlib is GPL though, so if I'm understanding correctly, you can only use it if you open-source your program? :(

Code: Select all

EnableExplicit

; PureBasic include file that binds Agner Fog's asmlib to PureBasic on
; Windows (COFF), Linux (ELF) and Mac OS X (Mach-O), for x86 and x64.
;
; The asmlib string functions take C "char*" strings, so PureBasic strings cannot be
; passed to them directly when the program is compiled in Unicode mode.
CompilerIf #PB_Compiler_Unicode And #PB_Compiler_Debugger
  Debug "ASMLIB: In Unicode mode you can't use the asmlib string functions with pb strings"
CompilerEndIf

; Include guard: everything below is declared only once per compilation.
CompilerIf Defined(AGNERFOG_ASMLIB_H, #PB_Constant) = 0
  #AGNERFOG_ASMLIB_H = 1
  ;- NOTES
  ; Define #ASMLIB_OVERRIDE_STANDARD_LIBRARY before including this file to override
  ; the standard library with these optimized functions (selects the "...o" build).

  IncludePath #PB_Compiler_FilePath
  
  ; Folder that contains the "asmlib" sub-folder with the library files.
  #ASMLIB_Path = #PB_Compiler_FilePath
  
  ; Bitness suffix used in the library file names.
  CompilerIf #PB_Compiler_Processor = #PB_Processor_x86
    #ASMLIB_CPU = "32"
  CompilerElse
    #ASMLIB_CPU = "64"
  CompilerEndIf
  
  ; Pick the library file for the target OS: libacof* (Windows), libaelf* (Linux),
  ; libamac* (Mac OS X). The trailing "o" marks the override build.
  CompilerIf #PB_Compiler_OS = #PB_OS_Windows
    CompilerIf Defined(ASMLIB_OVERRIDE_STANDARD_LIBRARY, #PB_Constant) = 1
      #ASMLIB_LIB = #ASMLIB_Path + "asmlib\libacof"+#ASMLIB_CPU+"o.lib"
    CompilerElse
      #ASMLIB_LIB = #ASMLIB_Path + "asmlib\libacof"+#ASMLIB_CPU+".lib"
    CompilerEndIf
  CompilerElseIf #PB_Compiler_OS = #PB_OS_Linux
    CompilerIf Defined(ASMLIB_OVERRIDE_STANDARD_LIBRARY, #PB_Constant) = 1
      #ASMLIB_LIB = #ASMLIB_Path + "asmlib/libaelf"+#ASMLIB_CPU+"o.a"
    CompilerElse
      #ASMLIB_LIB = #ASMLIB_Path + "asmlib/libaelf"+#ASMLIB_CPU+".a"
    CompilerEndIf
  CompilerElseIf #PB_Compiler_OS = #PB_OS_MacOS
    CompilerIf Defined(ASMLIB_OVERRIDE_STANDARD_LIBRARY, #PB_Constant) = 1
      #ASMLIB_LIB = #ASMLIB_Path + "asmlib/libamac"+#ASMLIB_CPU+"o.a"
    CompilerElse
      #ASMLIB_LIB = #ASMLIB_Path + "asmlib/libamac"+#ASMLIB_CPU+".a"
    CompilerEndIf
  CompilerElse
    ; Without this branch #ASMLIB_LIB would stay undefined and the failure would
    ; surface later as an unrelated-looking "constant not found" error.
    CompilerError "ASMLIB: unsupported operating system"
  CompilerEndIf
  
  ; In override mode the library is linked through a separate, empty import section
  ; (via /DEFAULTLIB on Windows, directly by file name elsewhere) and the function
  ; declarations below are imported with an empty file name.
  CompilerIf Defined(ASMLIB_OVERRIDE_STANDARD_LIBRARY, #PB_Constant) = 1
    CompilerIf #PB_Compiler_OS = #PB_OS_Windows
      ImportC "/DEFAULTLIB:"+#ASMLIB_LIB
      EndImport    
    CompilerElse
      ImportC #ASMLIB_LIB
      EndImport    
    CompilerEndIf
    #ASMLIB_IMP_FILE = ""
  CompilerElse
    #ASMLIB_IMP_FILE = #ASMLIB_LIB
  CompilerEndIf
  
  CompilerIf #PB_Compiler_Processor = #PB_Processor_x86   
    ImportC #ASMLIB_IMP_FILE ; CDECL
  CompilerElse ; x64 - FASTCALL      
    Import #ASMLIB_IMP_FILE
  CompilerEndIf
      
    ; memory functions
    A_memcpy.i(*dest, *src, count.i)                      ; Copy count bytes from src to dest
    A_memmove.i(*dest, *src, count.i)                     ; Same as memcpy, allows overlap between src and dest
    A_memset.i(*dest, c.l, count.i)                       ; Set count bytes in dest to (char)c
    A_memcmp.l(*buf1, *buf2, num.i)                       ; Compares two blocks of memory
    
    GetMemcpyCacheLimit.i()                               ; Data blocks bigger than this will be copied uncached by memcpy and memmove
    SetMemcpyCacheLimit(Value.i)                          ; Change limit in GetMemcpyCacheLimit
    GetMemsetCacheLimit.i()                               ; Data blocks bigger than this will be stored uncached by memset
    SetMemsetCacheLimit(Value.i)                          ; Change limit in GetMemsetCacheLimit
    
    ; string functions (zero-terminated C strings)
    A_strcat.i(*dest, *src)                               ; returns (char*) - Concatenate strings dest and src. Store result in dest
    A_strcpy.i(*dest, *src)                               ; returns (char*) - Copy string src to dest
    A_strlen.i(*str)                                      ; Get length of zero-terminated string
    A_strcmp.l(*a, *b)                                    ; Compare strings. Case sensitive
    A_stricmp.l(*string1, *string2)                       ; Compare strings. Case insensitive for A-Z only
    A_strstr.i(*haystack, *needle)                        ; returns (char*) - Search for substring in string
    A_strtolower(*string)                                 ; Convert string to lower case for A-Z only
    A_strtoupper(*string)                                 ; Convert string to upper case for a-z only
    A_substring.i(*dest, *source, pos.i, len.i)           ; Copy a substring from source into dest
    A_strspn.i(*str, *set)                                ; Find span of characters that belong to set
    A_strcspn.i(*str, *set)                               ; Find span of characters that don't belong to set
    strCountInSet.i(*str, *set)                           ; Count characters that belong to set
    strcount_UTF8.i(*str)                                 ; Counts the number of characters in a UTF-8 encoded string
    
    ; miscellaneous functions
    A_popcount.l(x.l)                                     ; Count 1-bits in 32-bit integer
    RoundD.l(x.d)                                         ; Round to nearest or even
    RoundF.l(x.f)                                         ; Round to nearest or even
    InstructionSet.l()                                    ; Tell which instruction set is supported
    ProcessorName.i()                                     ; ASCIIZ text describing microprocessor
    CpuType(*vendor.LONG, *family.LONG, *model.LONG)      ; Get CPU vendor, family and model
    DataCacheSize.i(level.l)                              ; Get size of data cache
    A_DebugBreak()                                        ; Makes a debug breakpoint
    ReadTSC.q()                                           ; Read microprocessor internal clock (64-bit counter; .q keeps all 64 bits on x86 too)
    cpuid_ex(*abcd, _eax.l, _ecx.l)                       ; call CPUID instruction (*abcd receives 4 longs; eax/ecx are C ints)
    
    ; integer division functions
    ; not done...
    
  EndImport
CompilerEndIf
Post Reply