Strip/replace double or multiple spaces with single
Posted: Fri Nov 13, 2015 9:57 am
I needed to replace every run of multiple spaces with a single space, but ReplaceString() only gets half the work done when the replacement is exactly half of the search string, such as replacing two spaces with one:
This replaces the 8 space chars with 4, so there are still multiple spaces left over.
To solve this it seems a loop is required: first check whether any double spaces are left, then replace them if so:
But I need to do this a lot, and there should be a more efficient way than looping like that, so I wrote what I guess is a typical(?) "copy valid chars back from ptrB to ptrA" routine, incrementing both pointers as necessary (and for speed reasons this modifies the string you give it - it doesn't return a separate string):
I get really good performance out of that! But being in an assembly mood this week, I thought I'd try my luck with making an asm version, which was fun as I'm enjoying learning.
Timing tests, including ones posted later in this thread ...
My short string is "one     two     three     four     five     six     seven     eight     nine     end" (five spaces between each word, so 53% space chars), which is 84 bytes.
My long string is 2000 copies of that (for a total string len of 168kb).
Then there are the Unicode versions also!
I try 5 million calls to the short buffer, and 10 thousand calls to the long one.
Code: Select all
; Demonstrates why one ReplaceString() pass is not enough: matches do not
; overlap, so a run of 8 spaces contains four two-space matches and comes out
; as 4 spaces rather than 1.
; Space() is used instead of literal runs of blanks so the exact counts stay
; visible even if the surrounding whitespace gets reformatted.
mystr.s = "one" + Space(8) + "two"
mystr = ReplaceString(mystr, Space(2), " ")
Debug(mystr) ; shows "one", 4 spaces, "two"
To solve this it seems a loop is required: first check whether any double spaces are left, then replace them if so:
Code: Select all
; Returns a copy of sStr in which every run of two or more spaces has been
; collapsed to a single space.
;
; sStr    : the input string (passed by value; the caller's string is untouched).
; Returns : the collapsed string.
;
; A single ReplaceString() pass only halves each run because matches do not
; overlap, so the replacement is repeated until no two-space pair remains.
Procedure.s StripDualSpaces_ReplaceString(sStr.s) ;Win+Linux+Mac, 32+64, Unicode+Ascii
  ; Build the search pattern with Space(2): it must be exactly TWO spaces.
  ; With a one-space pattern FindString() would keep matching and
  ; ReplaceString() would change nothing, so the loop would never terminate.
  Protected sDual.s = Space(2)

  While FindString(sStr, sDual) > 0
    sStr = ReplaceString(sStr, sDual, " ")
  Wend

  ProcedureReturn sStr
EndProcedure
But I need to do this a lot, and there should be a more efficient way than looping like that, so I wrote what I guess is a typical(?) "copy valid chars back from ptrB to ptrA" routine, incrementing both pointers as necessary (and for speed reasons this modifies the string you give it - it doesn't return a separate string):
Code: Select all
DisableDebugger
; Collapses every run of two or more spaces to a single space, IN PLACE.
;
; *pstr : address of a null-terminated string in the compiler's native
;         character width (e.g. @myString). The buffer is overwritten; the
;         result is never longer than the input. A null pointer is ignored.
;
; Works with a read pointer (*src) that visits every character and a write
; pointer (*dst) that only advances when a character is kept, so *dst never
; overtakes *src.
Procedure StripDualSpacesPBptr(*pstr) ;Win+Linux+Mac, 32+64, Unicode+Ascii
  Protected *dst.Character, *src.Character, prevWasSpace.i

  If *pstr = 0
    ProcedureReturn ; nothing to do; avoids dereferencing a null pointer
  EndIf

  *dst = *pstr
  *src = *pstr

  While *src\c <> 0
    If *src\c = 32
      ; Keep only the first space of a run.
      If prevWasSpace = 0
        *dst\c = 32
        *dst + SizeOf(Character)
      EndIf
      prevWasSpace = 1
    Else
      *dst\c = *src\c
      *dst + SizeOf(Character)
      prevWasSpace = 0
    EndIf
    *src + SizeOf(Character) ; 1 byte in Ascii builds, 2 in Unicode builds
  Wend

  *dst\c = 0 ; terminate the (possibly shortened) string
EndProcedure
EnableDebugger
I get really good performance out of that! But being in an assembly mood this week, I thought I'd try my luck with making an asm version, which was fun as I'm enjoying learning.

Code: Select all
; On 32-bit x86 builds, map the 64-bit register names rax/rbx/rcx onto
; eax/ebx/ecx so the same inline-asm source can be written once for both
; 32-bit and 64-bit targets.
; NOTE(review): these are PureBasic macros, so presumably they are only
; expanded on asm lines parsed by PureBasic, not on lines passed straight to
; the assembler with a leading "!" - confirm before moving register names
; onto "!" lines.
CompilerIf #PB_Compiler_Processor = #PB_Processor_x86
Macro rax : eax : EndMacro ;thanks again wilbert for these helpers!
Macro rbx : ebx : EndMacro
Macro rcx : ecx : EndMacro
CompilerEndIf
DisableDebugger ;Win+Linux+Mac, 32+64, Unicode+Ascii
CompilerIf #PB_Compiler_Unicode
; Collapses every run of two or more spaces to a single space, IN PLACE
; (Unicode build: 16-bit characters).
;
; *pstr : address of a null-terminated 16-bit-character string. The buffer is
;         overwritten; the result is never longer than the input.
;
; Register roles:
;   rax = write pointer (next kept character goes here)
;   rbx = read pointer  (saved and restored around the loop)
;   rcx = flag: non-zero when the previous character was a space
;   dx  = current character
Procedure StripDualSpacesAsm(*pstr)
  EnableASM
  mov rax, *pstr              ; read the parameter before the push below, as the original does
  push rbx
  mov rbx, rax
  XOr rcx, rcx                ; FIX: the flag was never initialised, so a leading space
                              ; could be dropped depending on what rcx held on entry
  !sdsa_u_next:
  mov dx, [rbx]
  ! cmp dx, 0                 ;Null?
  ! je sdsa_u_done
  ! cmp dx, 32                ;Space?
  ! jne sdsa_u_keep           ;Other
  ; --- space character ---
  cmp rcx, 0
  ! jne sdsa_u_skip
  ; first space of a run: keep it and remember we are inside a run
  mov rcx, 1
  mov [rax], dx
  add rax, 2
  add rbx, 2
  ! jmp sdsa_u_next
  !sdsa_u_skip:
  ; second or later space of a run: advance the read pointer only
  add rbx, 2
  ! jmp sdsa_u_next
  !sdsa_u_keep:
  ; any other character: copy it and leave "inside a run" state
  XOr rcx, rcx
  mov [rax], dx
  add rax, 2
  add rbx, 2
  ! jmp sdsa_u_next
  !sdsa_u_done:
  mov [rax], word 0           ; terminate the (possibly shortened) string
  pop rbx
  DisableASM
EndProcedure
CompilerElse
; Collapses every run of two or more spaces to a single space, IN PLACE
; (Ascii build: 8-bit characters).
;
; *pstr : address of a null-terminated 8-bit-character string. The buffer is
;         overwritten; the result is never longer than the input.
;
; Register roles:
;   rax = write pointer (next kept character goes here)
;   rbx = read pointer  (saved and restored around the loop)
;   rcx = flag: non-zero when the previous character was a space
;   dl  = current character
Procedure StripDualSpacesAsm(*pstr)
  EnableASM
  mov rax, *pstr              ; read the parameter before the push below, as the original does
  push rbx
  mov rbx, rax
  XOr rcx, rcx                ; FIX: the flag was never initialised, so a leading space
                              ; could be dropped depending on what rcx held on entry
  !sdsa_a_next:
  mov dl, [rbx]
  ! cmp dl, 0                 ;Null?
  ! je sdsa_a_done
  ! cmp dl, 32                ;Space?
  ! jne sdsa_a_keep           ;Other
  ; --- space character ---
  cmp rcx, 0
  ! jne sdsa_a_skip
  ; first space of a run: keep it and remember we are inside a run
  mov rcx, 1
  mov [rax], dl
  inc rax
  inc rbx
  ! jmp sdsa_a_next
  !sdsa_a_skip:
  ; second or later space of a run: advance the read pointer only
  inc rbx
  ! jmp sdsa_a_next
  !sdsa_a_keep:
  ; any other character: copy it and leave "inside a run" state
  XOr rcx, rcx
  mov [rax], dl
  inc rax
  inc rbx
  ! jmp sdsa_a_next
  !sdsa_a_done:
  mov [rax], byte 0           ; terminate the (possibly shortened) string
  pop rbx
  DisableASM
EndProcedure
CompilerEndIf
EnableDebugger
Timing tests, including ones posted later in this thread ...
My short string is "one     two     three     four     five     six     seven     eight     nine     end" (five spaces between each word, so 53% space chars), which is 84 bytes.
My long string is 2000 copies of that (for a total string len of 168kb).
Then there are the Unicode versions also!
I try 5 million calls to the short buffer, and 10 thousand calls to the long one.
Code: Select all
TimePB=Native PB, FindString+ReplaceString loop
TimeK1=Keya's PB, BytePtr-based
TimeK2=Keya's asm, BytePtr-based w/ 8bit ops
TimeW1=wilbert's asm, BytePtr-based w/ 32bit ops
TimeR1=Rashad's PB, StringField-based
TimeR2=Rashad's PB, Peek-based
TimeI1=IdeasVacuum's PB, StringField-based
OS=Linux-64 Char=Ascii String len=84, trying 5000000 calls...
TimeR1=38568
TimeR2=33397
TimeI1=20884
TimePB=6655
TimeK1=1145
TimeK2=643
TimeW1=422
OS=Linux-64 Char=Unicode String len=84, trying 5000000 calls...
TimeR1=99061
TimeR2=50534
TimeI1=22502
TimePB=8557
TimeK1=1259
TimeK2=791
TimeW1=582
OS=Linux-64 Char=Ascii String len=168000, trying 10000 calls...
TimeR1=184175000
TimeR2=53705000
TimeI1=51403000
TimePB=21622
TimeK1=3954
TimeK2=2138
TimeW1=1235
OS=Linux-64 Char=Unicode String len=168000, trying 10000 calls...
TimeR1=531260000
TimeR2=175150000
TimeI1=60638000
TimePB=24686
TimeK1=4657
TimeK2=2709
TimeW1=1841
---
OS=Linux-32 Char=Ascii String len=84, trying 5000000 calls...
TimeR2=39688
TimeR1=35749
TimeI1=26846
TimePB=9655
TimeK1=1121
TimeK2=564
TimeW1=402
OS=Linux-32 Char=Unicode String len=84, trying 5000000 calls...
TimeR1=99186
TimeR2=62115
TimeI1=26096
TimePB=9200
TimeK1=1342
TimeK2=728
TimeW1=651
OS=Linux-32 Char=Ascii String len=168000, trying 10000 calls...
TimeR1=148315000
TimeI1=65492000
TimeR2=58475000
TimePB=22914
TimeK1=3881
TimeK2=1725
TimeW1=1144
OS=Linux-32 Char=Unicode String len=168000, trying 10000 calls...
TimeR1=511220000
TimeR2=175540000
TimeI1=57704000
TimePB=25697
TimeK1=4630
TimeK2=2207
TimeW1=1883
---
OS=OSX-64 Char=Ascii String len=84, trying 5000000 calls...
TimeR2=60728
TimeR1=47592
TimeI1=35529
TimePB=8584
TimeK1=1459
TimeK2=833
TimeW1=749
OS=OSX-64 Char=Unicode String len=84, trying 5000000 calls...
TimeR1=129911
TimeR2=96344
TimeI1=42651
TimePB=11463
TimeK1=2040
TimeK2=1503
TimeW1=1359
OS=OSX-64 Char=Ascii String len=168000, trying 10000 calls...
TimeR1=185485000
TimeR2=151470000
TimeI1=70608000
TimePB=23684
TimeK1=5089
TimeK2=2802
TimeW1=2286
OS=OSX-64 Char=Unicode String len=168000, trying 10000 calls...
TimeR1=582020000
TimeR2=353820000
TimeI1=80147000
TimePB=32381
TimeK1=6991
TimeK2=4804
TimeW1=4315
---
OS=Win-32 Char=Ascii String len=84, trying 5000000 calls...
TimeR2=67536
TimeR1=59299
TimeI1=30923
TimePB=22804
TimeK1=1552
TimeK2=1024
TimeW1=765
OS=Win-32 Char=Unicode String len=84, trying 5000000 calls...
TimeR1=117095
TimeR2=93698
TimeI1=32484
TimePB=11562
TimeK1=2054
TimeK2=1555
TimeW1=1457
OS=Win-32 Char=Ascii String len=168000, trying 10000 calls...
TimeR1=264375000
TimeR2=231745000
TimeI1=58647000
TimePB=84465 ;anomaly
TimeK1=5261
TimeK2=3610
TimeW1=2508
OS=Win-32 Char=Unicode String len=168000, trying 10000 calls...
TimeR1=570505000
TimeR2=375740000
TimeI1=75652000
TimePB=34182
TimeK1=7585
TimeK2=5629
TimeW1=4787
---
OS=Win-64 Char=Ascii String len=84, trying 5000000 calls...
TimeR2=71279
TimeR1=62363
TimeI1=34368
TimePB=9304
TimeK1=1475
TimeK2=991
TimeW1=699
OS=Win-64 Char=Unicode String len=84, trying 5000000 calls...
TimeR1=136522
TimeR2=96323
TimeI1=38987
TimePB=11055
TimeK1=1927
TimeK2=1628
TimeW1=1382
OS=Win-64 Char=Ascii String len=168000, trying 10000 calls...
TimeR1=278125000
TimeR2=225950000
TimeI1=77789000
TimePB=25608
TimeK1=5100
TimeK2=3363
TimeW1=2333
OS=Win-64 Char=Unicode String len=168000, trying 10000 calls...
TimeR1=626865000
TimeR2=345905000
TimeI1=87042000
TimePB=28811
TimeK1=6845
TimeK2=5005
TimeW1=4062