Page 1 of 1

Find a string in a file

Posted: Mon Aug 25, 2008 5:11 pm
by Little John
Works also with PB 5.20

I hope the code (+ comments) is pretty self-explanatory. The file can be of arbitrary size.

Regards, Little John

edit:
There is another version in the third post.

Code: Select all

; tested with PB 4.20

EnableExplicit

Procedure.q FindInFile (infile.s, search.s, start.q=0, matchCase.l=#True, maxChunk.l=4096)
   ; -- Look in <infile> for string <search>, beginning at byte offset <start>.
   ;    The file is read in chunks of at most <maxChunk> bytes and each chunk
   ;    is interpreted in the program's native character format (1 byte per
   ;    character in ASCII mode, 2 bytes per character in Unicode mode).
   ; in : infile   : name of the file to search
   ;      search   : text to look for
   ;      start    : byte offset in the file where the search begins
   ;      matchCase: #True = case-sensitive, #False = case-insensitive
   ;      maxChunk : maximum number of bytes read per chunk
   ; out: * offset in the file (bytes), where <search> was found
   ;      * -1: <search> was not found in <infile>
   ;      * -2: <infile> couldn't be opened for reading
   ;      * -3: <search> is bigger than <maxChunk>, or <maxChunk> is too
   ;            small to hold a single character
   ; Note: The first offset in the file is 0 (not 1)!
   ; Note: A NUL character inside a chunk ends the text of that chunk, so
   ;       this procedure is meant for text files only.
   Protected buffer.s, chunk.s
   Protected chunkBytes.l, move.l, bytes.l, posn.q
   Protected ifn.i                  ; #PB_Any numbers need a pointer-sized integer

   ; Only read whole characters per chunk.
   chunkBytes = maxChunk - (maxChunk % SizeOf(Character))

   ; Consecutive chunks overlap by (length of <search> - 1) characters, so a
   ; match that straddles a chunk border is still seen. The step is a whole
   ; number of characters; otherwise every chunk after the first one would be
   ; read at a misaligned offset in Unicode mode.
   move = chunkBytes - StringByteLength(search) + SizeOf(Character)
   If chunkBytes < SizeOf(Character) Or move < 1
      ProcedureReturn -3            ; error
   EndIf

   ifn = ReadFile(#PB_Any, infile)
   If ifn = 0
      ProcedureReturn -2            ; error
   EndIf

   If matchCase = #False
      search = UCase(search)
   EndIf

   ; <buffer> only serves as raw memory for ReadData() and is never
   ; reassigned, so its size stays fixed for the whole search.
   buffer = Space(chunkBytes / SizeOf(Character))

   posn = -1
   Repeat
      FileSeek(ifn, start)
      bytes = ReadData(ifn, @buffer, chunkBytes)
      If bytes >= SizeOf(Character)
         ; Only look at the characters that were actually read. After a short
         ; read the rest of <buffer> still holds data of the previous chunk.
         chunk = PeekS(@buffer, bytes / SizeOf(Character))
         If matchCase = #False
            chunk = UCase(chunk)
         EndIf
         posn = FindString(chunk, search, 1) - 1
         If posn <> -1              ; found
            ; character index in the chunk -> byte offset in the file
            posn = posn * SizeOf(Character) + start
            Break
         EndIf
      EndIf
      start + move
   Until bytes < chunkBytes

   CloseFile(ifn)
   ProcedureReturn posn
EndProcedure


;-- Demo: search a sample file for a sample text and show the result
;   (offset of the match, or a negative error code)
Define fileName.s = "source.txt"
Define needle.s = "Hello World!"

Debug FindInFile(fileName, needle)

Posted: Tue Aug 26, 2008 7:56 pm
by SFSxOI
Thanks Little John, I might have a use for this.

Posted: Wed Aug 27, 2008 5:39 am
by Little John
Works also with PB 5.20

You are welcome! I'm glad when it is useful for you.

BTW, the code in the first post only works when there are no NULL bytes in the file.
Here is another version, that also works with any binary files.

edit 2015-09-10:
- Improved the example, so that it works with different file formats, and it doesn't matter whether the program is compiled in ASCII mode or Unicode mode.
- Some other small improvements.

Code: Select all

; tested with PB 5.31

EnableExplicit

; Overlay used to address a memory area byte by byte through a typed
; pointer: *p.ByteArray ... *p\byte[i]
Structure ByteArray
   byte.b[0]   ; .b element; QuickSearch masks it with "& #FF" before using it as a table index
EndStructure

Procedure.i QuickSearch (*mainMem.ByteArray, mainSize.i, *findMem.ByteArray, findSize.i, startOff.i=0)
   ; -- Simplification of the Boyer-Moore algorithm;
   ;    searches for a sequence of bytes in memory
   ;    (not for characters, so it works in ASCII mode and Unicode mode)
   ; in : *mainMem: pointer to memory area where to search
   ;      mainSize: size of memory area where to search (bytes)
   ;      *findMem: pointer to byte sequence to search for
   ;      findSize: number of bytes to search for
   ;      startOff: offset in <mainMem>, where the search begins (bytes)
   ; out: offset in <mainMem>, where <findMem> was found (bytes);
   ;      -1 if not found, if <findSize> is smaller than 1,
   ;      or if <startOff> is negative
   ; Note: The first offset is 0 (not 1)!
   ;
   ; after <http://www-igm.univ-mlv.fr/~lecroq/string/node19.html#SECTION00190>, 31.8.2008
   ; (translated from C to PureBasic by Little John)
   Protected i.i, diff.i
   Protected Dim badByte.i(255)

   If findSize < 1 Or startOff < 0
      ProcedureReturn -1                                            ; nothing to search for
   EndIf

   ; Preprocessing:
   ; badByte(b) is the distance by which the search window is moved when the
   ; byte directly behind the window is <b>. Bytes that do not occur in the
   ; pattern move the window completely past that byte.
   For i = 0 To 255
      badByte(i) = findSize + 1
   Next
   For i = 0 To findSize - 1
      badByte(*findMem\byte[i] & #FF) = findSize - i               ; "& #FF": .b is signed
   Next

   ; Searching
   diff = mainSize - findSize                                       ; last possible match offset
   While startOff <= diff
      If CompareMemory(*mainMem + startOff, *findMem, findSize) = 1
         ProcedureReturn startOff
      EndIf
      If startOff = diff
         ; This was the last window. The byte behind it would be
         ; *mainMem\byte[mainSize], which is outside of the memory area
         ; (the C original relies on a terminating NUL byte there).
         Break
      EndIf
      startOff + badByte(*mainMem\byte[startOff + findSize] & #FF)  ; shift
   Wend

   ProcedureReturn -1                                               ; not found
EndProcedure


Procedure.q FindInFile (infile.i, *find, findSize.i, startOff.q=0, bufferSize.i=4096)
   ; -- Scans the already opened file <infile> for the byte sequence at *find.
   ;    The search is done on raw bytes, so it is independent of the
   ;    compiler's ASCII / Unicode mode.
   ; in : infile    : number of a file, that was opened for reading
   ;      *find     : pointer to byte sequence to search for
   ;      findSize  : number of bytes to search for
   ;      startOff  : offset in the file where the search begins (bytes)
   ;      bufferSize: size of used memory buffer (bytes)
   ; out: >= 0: offset in the file (bytes) where the sequence was found,
   ;      -1  : the sequence was not found,
   ;      -2  : error (<findSize> bigger than <bufferSize>, or out of memory)
   ; Note: The first offset is 0 (not 1)!
   Protected *chunk
   Protected result.q, stride.i, got.i, hit.i

   ; Successive chunks overlap by findSize-1 bytes, so that a sequence
   ; lying across a chunk border is not missed.
   stride = bufferSize - findSize + 1
   If stride < 1
      ProcedureReturn -2                 ; error
   EndIf

   *chunk = AllocateMemory(bufferSize)
   If *chunk = 0
      ProcedureReturn -2                 ; error
   EndIf

   result = -1
   Repeat
      FileSeek(infile, startOff)
      got = ReadData(infile, *chunk, bufferSize)
      ; offset of the sequence inside the chunk, or -1
      hit = QuickSearch(*chunk, got, *find, findSize)
      If hit = -1
         startOff + stride               ; try the next chunk
      Else
         result = hit                    ; chunk offset -> file offset
         result + startOff
      EndIf
   Until hit <> -1 Or got < bufferSize   ; found, or last chunk was read

   FreeMemory(*chunk)
   ProcedureReturn result
EndProcedure


; -- Demo: encode the search text in the format of the file, then look for
;    the resulting byte sequence
Define path$, needle$, hFile.i, encoding.i, needleBytes.i, *needle, hit.q

path$ = "source.txt"
needle$ = "Äpfel"

hFile = ReadFile(#PB_Any, path$)
If hFile = 0
   Debug "Error reading from file '" + path$ + "'."
Else
   encoding = ReadStringFormat(hFile)
   ; If the above line does not work (because there is no BOM in the file), select the file format manually:
   ; encoding = #PB_Ascii
   ; encoding = #PB_UTF8
   ; encoding = #PB_Unicode

   needleBytes = StringByteLength(needle$, encoding)
   *needle = AllocateMemory(needleBytes + 2)       ; + room for the terminator written by PokeS()
   If *needle = 0
      Debug "Error allocating memory for search string."
   Else
      PokeS(*needle, needle$, -1, encoding)
      hit = FindInFile(hFile, *needle, needleBytes)
      If hit = -2
         Debug "Error."
      ElseIf hit = -1
         Debug "'" + needle$ + "' not found in file '" + path$ + "'."
      Else
         Debug "'" + needle$ + "' found in file '" + path$ + "' at offset " + Str(hit) + "."
      EndIf
      FreeMemory(*needle)
   EndIf
   CloseFile(hFile)
EndIf
Regards, Little John

Re: Find a string in a file

Posted: Thu Sep 10, 2015 1:49 pm
by LiK137
Strangely, this does not work when Unicode is enabled in the compiler options, in either 4.x or 5.x.
I have tested many in-file search codes, and none of them were successful.
Any suggestions?

Re: Find a string in a file

Posted: Thu Sep 10, 2015 3:07 pm
by infratec
Hi,

since bytes are searched, you have to use a search buffer with the correct bytes inside.
Use PokeS() with the format flag which corresponds to your file:
#PB_Ascii, #PB_UTF8 or #PB_Unicode.

Because you will not find the byte sequence of a Unicode string in an ASCII text file.

Bernd

Re: Find a string in a file

Posted: Thu Sep 10, 2015 4:58 pm
by Little John
infratec wrote: since bytes are searched, you have to use a search buffer with the correct bytes inside.
Use PokeS() with the format flag which corresponds to your file:
#PB_Ascii, #PB_UTF8 or #PB_Unicode.

Because you will not find the byte sequence of a Unicode string in an ASCII text file.
Perfect explanation. :-)

In the third post, I have changed the example in order to demonstrate exactly what Bernd wrote.
I am sorry, that example was not good before.

Re: Find a string in a file

Posted: Thu Sep 10, 2015 7:43 pm
by LiK137
ThanQ very much for eXplanation

Re: Find a string in a file

Posted: Thu Sep 10, 2015 8:25 pm
by davido
@Little John,

Thank you for going to the trouble to translate it, and then sharing it.
I bet your code looks far prettier than the original code did in C.

Re: Find a string in a file

Posted: Wed Mar 02, 2016 1:25 am
by normeus
I needed to search files for certain text and return the full line, so I modified this code by Little John.
It will return the line where the match was found. There is no noticeable slowdown compared to the original (I was searching 300 MB text files in under 8 seconds).
It is working fine for me with Win7 Pro & PB 5.41 LTS.

Code: Select all

; tested with PB 5.31 / PB 5.41 LTS
; DESC: little john search in files
; INFO: http://www.purebasic.fr/english/viewtopic.php?p=256625#p256625

; added a way to get the current line
; change eline$ to your OS linefeed, currently set to windows
EnableExplicit

; Overlay used to address a memory area byte by byte through a typed
; pointer: *p.ByteArray ... *p\byte[i]
Structure ByteArray
   byte.b[0]   ; .b element; QuickSearch masks it with "& #FF" before using it as a table index
 EndStructure
 
 ; Result record filled by FindInFile: position of the match plus the
 ; start and end of the line that contains it. All values are .q fields.
 Structure foundpos
   found.q     ; set from the search result; the demo treats -1 as "not found" and -2 as "error"
   lstart.q    ; set by FindInFile to the computed begin of the line containing the match
   lend.q      ; set by FindInFile to the computed position of the line terminator after the match
 EndStructure
 

Procedure.i QuickSearch (*mainMem.ByteArray, mainSize.i, *findMem.ByteArray, findSize.i, startOff.i=0)
   ; -- Simplification of the Boyer-Moore algorithm;
   ;    searches for a sequence of bytes in memory
   ;    (not for characters, so it works in ASCII mode and Unicode mode)
   ; in : *mainMem: pointer to memory area where to search
   ;      mainSize: size of memory area where to search (bytes)
   ;      *findMem: pointer to byte sequence to search for
   ;      findSize: number of bytes to search for
   ;      startOff: offset in <mainMem>, where the search begins (bytes)
   ; out: offset in <mainMem>, where <findMem> was found (bytes);
   ;      -1 if not found, if <findSize> is smaller than 1,
   ;      or if <startOff> is negative
   ; Note: The first offset is 0 (not 1)!
   ;
   ; after <http://www-igm.univ-mlv.fr/~lecroq/string/node19.html#SECTION00190>, 31.8.2008
   ; (translated from C to PureBasic by Little John)
   Protected i.i, diff.i
   Protected Dim badByte.i(255)

   If findSize < 1 Or startOff < 0
      ProcedureReturn -1                                            ; nothing to search for
   EndIf

   ; Preprocessing:
   ; badByte(b) is the distance by which the search window is moved when the
   ; byte directly behind the window is <b>. Bytes that do not occur in the
   ; pattern move the window completely past that byte.
   For i = 0 To 255
      badByte(i) = findSize + 1
   Next
   For i = 0 To findSize - 1
      badByte(*findMem\byte[i] & #FF) = findSize - i               ; "& #FF": .b is signed
   Next

   ; Searching
   diff = mainSize - findSize                                       ; last possible match offset
   While startOff <= diff
      If CompareMemory(*mainMem + startOff, *findMem, findSize) = 1
         ProcedureReturn startOff
      EndIf
      If startOff = diff
         ; This was the last window. The byte behind it would be
         ; *mainMem\byte[mainSize], which is outside of the memory area
         ; (the C original relies on a terminating NUL byte there).
         Break
      EndIf
      startOff + badByte(*mainMem\byte[startOff + findSize] & #FF)  ; shift
   Wend

   ProcedureReturn -1                                               ; not found
EndProcedure


Procedure.i FindInFile (infile.i, *find, findSize.i, *eline, elineSize.i, startOff.q=0, bufferSize.i=8192)
   ; -- Looks in <infile> for the byte sequence at *find and determines the
   ;    line that contains the match; works in ASCII mode and Unicode mode.
   ; in : infile    : number of a file, that was opened for reading
   ;      *find     : pointer to byte sequence to search for
   ;      findSize  : number of bytes to search for
   ;      *eline    : pointer to the byte sequence of the line terminator
   ;      elineSize : size of the line terminator (bytes)
   ;      startOff  : offset in the file where the search begins (bytes)
   ;      bufferSize: size of used memory buffer (bytes)
   ; out: pointer to a newly allocated foundpos record, which the caller has
   ;      to release with FreeMemory(); 0 if that record could not be
   ;      allocated. Fields of the record:
   ;      \found : offset in the file where the byte sequence was found,
   ;               -1 if it was not found, -2 on error (bad sizes, no memory)
   ;      \lstart: offset of the first byte of the line containing the match
   ;      \lend  : offset of the line terminator that follows the match, or
   ;               the file length if the last line has no terminator
   ;      \lstart and \lend are -1 unless \found >= 0.
   ; Note: The first offset is 0 (not 1)!
   ; Note: The procedure returns a pointer, so its type is .i (a .l result
   ;       would cut off the pointer in 64-bit programs).
   Protected *buffer, *retval.foundpos
   Protected move.i, bytes.i, hit.i, lastHit.i, scan.i, want.i
   Protected pos.q, winStart.q, winEnd.q, fwd.q

   *retval = AllocateMemory(SizeOf(foundpos))
   If *retval = 0
      ProcedureReturn 0
   EndIf
   *retval\found  = -2                   ; pessimistic default: error
   *retval\lstart = -1
   *retval\lend   = -1

   move = bufferSize - findSize + 1
   If move < 1 Or findSize < 1 Or elineSize < 1 Or elineSize > bufferSize
      ProcedureReturn *retval            ; error, reported in \found
   EndIf

   *buffer = AllocateMemory(bufferSize)
   If *buffer = 0
      ProcedureReturn *retval            ; error, reported in \found
   EndIf

   ; 1) Locate the byte sequence. Chunks overlap by findSize-1 bytes, so
   ;    that a match lying across a chunk border is not missed.
   *retval\found = -1
   Repeat
      FileSeek(infile, startOff)
      bytes = ReadData(infile, *buffer, bufferSize)
      hit = QuickSearch(*buffer, bytes, *find, findSize)
      If hit <> -1                       ; found
         *retval\found = startOff
         *retval\found + hit
         Break
      EndIf
      startOff + move
   Until bytes < bufferSize

   If *retval\found >= 0
      pos = *retval\found

      ; 2) Begin of the line: walk backwards from the match, window by
      ;    window, and look for the last line terminator that ends at or
      ;    before the match. Without any terminator the line starts at 0.
      *retval\lstart = 0
      winEnd = pos                       ; exclusive end of the current window
      While winEnd > 0
         winStart = winEnd - bufferSize
         If winStart < 0
            winStart = 0
         EndIf
         want = winEnd - winStart
         FileSeek(infile, winStart)
         bytes = ReadData(infile, *buffer, want)

         lastHit = -1
         scan = 0
         Repeat                          ; keep the last occurrence in this window
            hit = QuickSearch(*buffer, bytes, *eline, elineSize, scan)
            If hit <> -1
               lastHit = hit
               scan = hit + 1
            EndIf
         Until hit = -1

         If lastHit <> -1
            *retval\lstart = winStart + lastHit + elineSize
            Break
         EndIf
         If winStart = 0
            Break                        ; reached the begin of the file
         EndIf
         ; overlap the windows, so that a terminator lying across
         ; the window border is seen in the next round
         winEnd = winStart + elineSize - 1
      Wend

      ; 3) End of the line: first line terminator at or behind the match.
      *retval\lend = Lof(infile)         ; default: line runs to the end of the file
      fwd = pos
      Repeat
         FileSeek(infile, fwd)
         bytes = ReadData(infile, *buffer, bufferSize)
         hit = QuickSearch(*buffer, bytes, *eline, elineSize)
         If hit <> -1
            *retval\lend = fwd
            *retval\lend + hit
            Break
         EndIf
         fwd + bufferSize - elineSize + 1
      Until bytes < bufferSize
   EndIf

   FreeMemory(*buffer)
   ProcedureReturn *retval
EndProcedure



; -- Demo: find a text in a file and show the complete line that contains it.
;    Search text and line terminator are encoded in the format of the file
;    first, because FindInFile compares raw bytes.
Define file$, search$, ifn.i, format.i, numBytes.i, *searchBuffer, *found.foundpos
Define elineBytes.i, eline$, *elBuffer, *readline, lineBytes.i

file$ = "largefiletosearch.txt"
search$ = "findThisText"

ifn = ReadFile(#PB_Any, file$)
If ifn
   format = ReadStringFormat(ifn)
   ; If the above line does not work (because there is no BOM in the file), select the file format manually:
   ; format = #PB_Ascii
   ; format = #PB_UTF8
   ; format = #PB_Unicode

   eline$ = Chr(13) + Chr(10)            ; "\r\n"; change this to the line terminator of your files
   elineBytes = StringByteLength(eline$, format)
   *elBuffer = AllocateMemory(elineBytes + 2)       ; + room for the terminator written by PokeS()

   numBytes = StringByteLength(search$, format)
   *searchBuffer = AllocateMemory(numBytes + 2)

   If *elBuffer And *searchBuffer
      PokeS(*elBuffer, eline$, -1, format)
      PokeS(*searchBuffer, search$, -1, format)
      *found = FindInFile(ifn, *searchBuffer, numBytes, *elBuffer, elineBytes)
      If *found
         Select *found\found
            Case -2
               Debug "Error."
            Case -1
               Debug "'" + search$ + "' not found in file '" + file$ + "'."
            Default
               Debug "'" + search$ + "' found in file '" + file$ + "' at offset " + Str(*found\found) + "."
               Debug "line starts at: " + Str(*found\lstart) + " and ends at: " + Str(*found\lend)
               lineBytes = *found\lend - *found\lstart
               If lineBytes > 0
                  *readline = AllocateMemory(lineBytes + 2)
                  If *readline
                     FileSeek(ifn, *found\lstart)
                     ReadData(ifn, *readline, lineBytes)
                     Debug PeekS(*readline, lineBytes, format)
                     FreeMemory(*readline)          ; only free what was really allocated
                  EndIf
               EndIf
         EndSelect
         FreeMemory(*found)
      Else
         Debug "Error."
      EndIf
   Else
      Debug "Error allocating memory for search string."
   EndIf

   ; release whichever buffers were allocated
   If *searchBuffer
      FreeMemory(*searchBuffer)
   EndIf
   If *elBuffer
      FreeMemory(*elBuffer)
   EndIf
   CloseFile(ifn)
Else
   Debug "Error reading from file '" + file$ + "'."
EndIf
Norm