https://en.wikipedia.org/wiki/Bloom_filter
Added original c fasthash for c backend
bloom items = 1,048,576 memory = 7.87 mb
Compressed size 1903041 b1858 kb 1.81 mb
Bloom get = 1048576
Bloom2 get = 1048576
Bloom3 get = 1048576
max false positives expected = 1024 actual 1033
Bloom set 26 ms
Bloom get 21 ms
map set 353 ms
map get 280 ms
Code: Select all
;High speed 64 bit Bloomfilter 
;Author Idle Andrew Ferguson
;licence MIT   
;v1.1 X64 ASM & C Backends, ARM64 PI4 and M1 C backends  
Structure bloom 
  *vt
  *filter 
  size.q
  elements.q
  hashes.q[2]
EndStructure 
Interface Ibloom 
  Set(*key,len)
  Get(*key,len) 
  GetSize()
  Or(*other.bloom)
  Compress(*len.long=0)
  Save(file.s)
  Free() 
EndInterface
Declare Bloom_Set(*this.bloom,*key,len) 
Declare Bloom_Get(*this.bloom,*key,len) 
Declare Bloom_Free(*this.bloom) 
Declare Bloom_GetSize(*this.bloom)
Declare Bloom_OR(*this.Bloom,*other.bloom) 
Declare Bloom_Compress(*this.bloom,*len.long=0) 
Declare Bloom_Save(*this.bloom,file.s) 
Declare Bloom_Decompress(*buf) 
Declare Bloom_Load(file.s) 
Declare Bloom_New(Items.i,MaxErrors.d=0.001)
DataSection : vt_bloom: 
  Data.i @Bloom_Set() 
  Data.i @Bloom_Get()
  Data.i @Bloom_GetSize()
  Data.i @Bloom_OR()
  Data.i @Bloom_Compress()
  Data.i @Bloom_Save()
  Data.i @Bloom_Free() 
EndDataSection 
CompilerIf #PB_Compiler_Backend = #PB_Backend_C  
    
  Procedure.q FastHash64(*buf,len,Seed.q=0)
    Protected result.q  
    ;FastHash64 algorithm by Zilong Tan
    !typedef unsigned long long uint64_t; 
    
    !#define mix(h) ({				      	  \
    !			(h) ^= (h) >> 23;		          \
    !			(h) *= 0x2127599bf4325c37ULL;	\
    !			(h) ^= (h) >> 47; })
    !
    
    !	const uint64_t m = 0x880355f21e6d1965ULL;
    !	const uint64_t *pos = (const uint64_t *)p_buf;
    !	const uint64_t *end = pos + (v_len / 8);
    !	const unsigned char *pos2;
    !	uint64_t h = v_seed ^ (v_len * m);
    !	uint64_t v;
    ! uint64_t result; 
    
    !	while (pos != end) {
    !		v  = *pos++;
    !		h ^= mix(v);
    !		h *= m;
    !	}
    
    !	pos2 = (const unsigned char*)pos;
    !	v = 0;
    
    !	switch (v_len & 7) {
    !	case 7: v ^= (uint64_t)pos2[6] << 48;
    !	case 6: v ^= (uint64_t)pos2[5] << 40;
    !	case 5: v ^= (uint64_t)pos2[4] << 32;
    !	case 4: v ^= (uint64_t)pos2[3] << 24;
    !	case 3: v ^= (uint64_t)pos2[2] << 16;
    !	case 2: v ^= (uint64_t)pos2[1] << 8;
    !	case 1: v ^= (uint64_t)pos2[0];
    !		h ^= mix(v);
    !		h *= m;
    !	}
    !
    !	v_result = mix(h);
    
    ProcedureReturn result 
  EndProcedure  
  
  
CompilerElse 
  
  Procedure.q FastHash64(*Buffer, Len, Seed.q=0)
    ; FastHash64 algorithm by Zilong Tan ported by wilbert
    !mov r10, 0x2127599bf4325c37
    !mov r11, 0x880355f21e6d1965
    !mov rdx, [p.p_Buffer]
    !mov rcx, [p.v_Len]
    !mov rax, rcx         ; h = seed ^ (len * m);
    !imul rax, r11
    !xor rax, [p.v_Seed]
    !sub rcx, 8
    !jc .l1
    ; 8 byte loop  
    !.l0:
    !mov r8, [rdx]        ; v = *pos++;
    !add rdx, 8
    ; -- mix(v) start --
    !mov r9, r8
    !shr r9, 23
    !xor r8, r9
    !imul r8, r10
    !mov r9, r8
    !shr r9, 47
    !xor r8, r9
    ; -- mix end --
    !xor rax, r8          ; h ^= mix(v);
    !imul rax, r11        ; h *= m;
    !sub rcx, 8
    !jnc .l0
    ; remaining bytes
    !.l1:
    !add rcx, 8
    !jz .l5
    !xor r8, r8
    !test rcx, 4
    !jz .l2
    ; get 4 bytes
    !mov r8d, [rdx]
    !add rdx, 4
    !ror r8, 32
    !.l2:
    !test rcx, 2
    !jz .l3
    ; get 2 bytes
    !movzx r9d, word [rdx]
    !add rdx, 2
    !xor r8, r9
    !ror r8, 16
    !.l3:
    !test rcx, 1
    !jz .l4
    ; get 1 byte
    !movzx r9d, byte [rdx]
    !xor r8, r9
    !ror r8, 8
    !.l4:
    !and rcx, 7
    !shl rcx, 3
    !rol r8, cl
    ; -- mix(v) start --
    !mov r9, r8
    !shr r9, 23
    !xor r8, r9
    !imul r8, r10
    !mov r9, r8
    !shr r9, 47
    !xor r8, r9
    ; -- mix end --
    !xor rax, r8          ; h ^= mix(v);
    !imul rax, r11        ; h *= m;
    ; -- mix(h) start --
    !.l5:
    !mov r9, rax
    !shr r9, 23
    !xor rax, r9
    !imul rax, r10
    !mov r9, rax
    !shr r9, 47
    !xor rax, r9
    ; -- mix end --
    ProcedureReturn       ; return mix(h);
  EndProcedure
  
CompilerEndIf 
Procedure Bloom_New(Items.i,MaxErrors.d=0.001)
  Protected *this.bloom,MaxError.d,size.i,pow.i  
  
  *this = AllocateMemory(SizeOf(bloom))
  
  If *this
    *this\vt = ?vt_bloom
    *this\elements = (-(2.0 * (items))) / (Log(1.0 - Pow(MaxErrors,0.5))) 
    *this\size = *this\elements / 8 
    *this\filter = AllocateMemory(*this\size) 
    *this\hashes[0] = 11400714819323198485
    *this\hashes[1] = 11400714819323198485 << 1
    
    ProcedureReturn *this 
  EndIf 
  
EndProcedure 
Procedure Bloom_Free(*this.bloom) 
  If *this 
    FreeMemory(*this\filter) 
    FreeMemory(*this)
    *this = 0 
  EndIf 
EndProcedure  
Procedure Bloom_GetSize(*this.bloom) 
  ProcedureReturn *this\size 
EndProcedure     
Procedure  Bloom_Set(*this.Bloom,*key,len)
  Protected hash.q,thash.q,a.i,*ta.Ascii
  Protected thash1.q
  thash1 =  FastHash64(*key,len) 
  hash =  (thash1 ! *this\hashes[0]) & $fffffffffffffff 
  thash = (thash1 ! *this\hashes[1]) & $fffffffffffffff
  hash % *this\elements
  thash % *this\elements
  *ta = *this\filter+(hash>>3)
  *ta\a | (1 << (hash & $07))
  *ta = *this\filter+(thash>>3)
  *ta\a | (1 << (thash & $07))
  ProcedureReturn thash1 
EndProcedure
Procedure Bloom_Get(*this.Bloom,*Key,len)
  Protected hash.q,thash.q,tret,retrn,a,*ta.Ascii
  Protected thash1.q,t1,t2
  thash1 =  FastHash64(*key,len) 
  hash =  (thash1 ! *this\hashes[0]) & $fffffffffffffff 
  thash = (thash1 ! *this\hashes[1]) & $fffffffffffffff 
  hash % *this\elements
  thash % *this\elements
  *ta = *this\filter+(hash>>3)
  t1 = (*ta\a & (1 << (hash & $07))) 
  *ta = *this\filter+(thash>>3)
  t2 = (*ta\a & (1 << (thash & $07)))
  If (t1 <> 0 And t2 <> 0) 
    ProcedureReturn #True
  Else
    ProcedureReturn #False
  EndIf
EndProcedure
UseLZMAPacker() 
#BLOOM_COMPPRESS = #PB_PackerPlugin_Lzma
Procedure Bloom_OR(*a.Bloom,*b.bloom)
  Protected *pa.Ascii,*pb.Ascii,a 
  
  If (*a And *b)
    
    If *a\Size <= *b\Size 
      *a\filter = ReAllocateMemory(*a\filter,*b\size)
      *a\size = *b\size  
    EndIf    
    
    *pa = *a\filter 
    *pb = *b\filter 
    
    For a = 0 To *b\size-1 
      *pa\a | *pb\a 
      *pa+1
      *pb+1
    Next 
  EndIf 
  
EndProcedure   
Procedure Bloom_Decompress(*buf) 
  Protected *this.bloom,size,size1,len  
  *this = AllocateMemory(SizeOf(bloom))
  
  size = PeekI(*buf) 
  size1 = PeekI(*buf+8)
  len = PeekI(*buf+16) 
  
  If *this
    *this\vt = ?vt_bloom
    *this\elements = len   
    *this\size = size1  
    *this\filter = AllocateMemory(*this\size) 
    *this\hashes[0] = 11400714819323198485
    *this\hashes[1] = 11400714819323198485 << 1
    
    len = UncompressMemory(*buf+24,size,*this\filter,size1,#BLOOM_COMPPRESS) 
    
    Debug *this\elements
    Debug *this\size 
    
    If len = size1 
      ProcedureReturn *this 
    EndIf 
  EndIf    
EndProcedure 
Procedure Bloom_Compress(*this.bloom,*len.long=0) 
  
  ;Debug *this\size 
  Debug "mem size filter " + Str(MemorySize(*this\filter)) 
  Debug "num elements " + Str(*this\elements) 
  Debug "size " +Str(*this\size) 
  
  Protected outsize ,*buf,len  
  outsize = *this\size + 24
  *buf = AllocateMemory(outsize) 
  len = CompressMemory(*this\filter,*this\size,*buf+24,outsize,#BLOOM_COMPPRESS,9) 
  
  Debug "Compressed Len : " + Str(len) 
  
  If len  
    PokeL(*buf,len) 
    PokeL(*buf+8,*this\size)
    PokeL(*buf+16,*this\elements) 
    If *len 
      *len\l = len+24 
    EndIf   
    ProcedureReturn *buf   
  EndIf   
  
EndProcedure 
Procedure Bloom_load(file.s) 
  
  Protected fn,bloom,*mem,len  
  fn = OpenFile(-1,file) 
  If fn 
    len = Lof(fn)
    If len 
      *mem = AllocateMemory(len) 
      ReadData(fn,*mem,len) 
      bloom = Bloom_Decompress(*mem) 
      FreeMemory(*mem) 
      CloseFile(fn) 
      ProcedureReturn bloom 
      
    EndIf 
  EndIf   
  
EndProcedure   
Procedure Bloom_Save(*this.bloom,file.s)
  Protected *buf,fn,res,len.l 
  fn = CreateFile(-1,file) 
  If fn 
    
    *buf = Bloom_Compress(*this,@len) 
    If (*buf And len > 0)  
      res = WriteData(fn,*buf,len)
    EndIf 
    FreeMemory(*buf) 
    CloseFile(fn) 
    ProcedureReturn res  
  EndIf 
  
EndProcedure    
CompilerIf #PB_Compiler_IsMainFile
  
  EnableExplicit
  
  Procedure MakeRandom(*buf.Ascii,seed=0) 
    Protected a, len,c  
    If seed <> 0 
      RandomSeed(seed)
    EndIf 
    len = (MemorySize(*buf)-1) 
    For a = 0 To Len 
      c = Random(3) 
      Select c 
        Case 0 
          *buf\a = 'A' 
        Case 1 
          *buf\a = 'C' 
        Case 2 
          *buf\a = 'G' 
        Case 3 
          *buf\a = 'T' 
      EndSelect 
      *buf+1 
    Next   
    
  EndProcedure  
  
  
  Global bloom.ibloom,bloom2,bloom3  
  Global k=31
  Global ers.d = 1.0 / 1024    ;max errors   
  Global size = 1024*1024
  Global size2 = 2 * size 
  Global size3
  Global out.s 
  Global *buf,*pc
  Global st,et,et1,stm,etm,etm1,a,ct,ct1,ct2,ct3,x
  
  *buf = AllocateMemory(size2)
  MakeRandom(*buf) 
  
  bloom = Bloom_new(size,ers) 
  
  Global NewMap mp(size) 
  
  st= ElapsedMilliseconds() 
  For a = 1 To size
    bloom_Set(bloom,*buf+a,k)
  Next 
  et = ElapsedMilliseconds()  
  For a = 1 To size
    ct + bloom_get(bloom,*buf+a,k)
  Next 
  et1 = ElapsedMilliseconds()  
  
  stm= ElapsedMilliseconds() 
  For a = 1 To size
    mp(PeekS(*buf+a,k,#PB_Ascii))=1
  Next 
  etm = ElapsedMilliseconds()  
  For a = 1 To size
    x=mp(PeekS(*buf+a,k,#PB_Ascii))
  Next 
  etm1 = ElapsedMilliseconds()  
  
  ;test for false positives, items that haven't been added to the set. 
  For a = size To size2
    ct1 + bloom_get(bloom,*buf+a,k) 
  Next 
  
  *pc = bloom\Compress()  ;compress bloom to buffer 
  If *pc 
    size3 = PeekL(*pc) 
  EndIf   
  
  bloom2 = Bloom_Decompress(*pc) ;returns a new bloom 
  FreeMemory(*pc) 
  
  For a = 1 To size
    ct2 + bloom_get(bloom2,*buf+a,k)
  Next 
  
  bloom3 = Bloom_new(size,ers) 
  Bloom_OR(bloom3,bloom2) 
  
  If  Bloom_Save(bloom3,"testbloom.blm") 
    Bloom_Free(bloom3)
    bloom3 = Bloom_load("testbloom.blm")
  EndIf    
  
  For a = 1 To size
    ct3 + bloom_get(bloom3,*buf+a,k)
  Next 
  
  
  out = "bloom items = " + FormatNumber(size,0,".",",") + " memory = " + StrF(bloom_GetSize(bloom) / 1024 / 1024,2) + " mb" + #CRLF$
  out + "Compressed size " + Str(size3) + " b" +  Str(size3 / 1024) + " kb  " + StrF(size3 / 1024 / 1024,2) + " mb" + #CRLF$
  out + "Bloom get = " + Str(ct) +#CRLF$
  out + "Bloom2 get = " + Str(ct2) +#CRLF$
  out + "Bloom3 get = " + Str(ct3) +#CRLF$
  
  out + "max false positives expected = " + Str(size*ers) + " actual " + Str(ct1) + #CRLF$ 
  out + "Bloom set " + Str(et-st) + " ms" + #CRLF$  
  out + "Bloom get " + Str(et1-et) + " ms" + #CRLF$  
  out + "map set " + Str(etm-stm) + " ms" + #CRLF$  
  out + "map get " + Str(etm1-etm) + " ms" + #CRLF$  
  
  
  SetClipboardText(out) 
  
  MessageRequester("bloom",out) 
  
  bloom_Free(bloom) 
  
CompilerEndIf   


