read file, get lines, preserve newlines/linebreaks

Share your advanced PureBasic knowledge/code with the community.
#NULL
Addict
Addict
Posts: 1499
Joined: Thu Aug 30, 2007 11:54 pm
Location: right here

read file, get lines, preserve newlines/linebreaks

Post by #NULL »

Some code for reading a file into a list of lines while preserving the lineendings/newlines/linebreaks.
There are two versions. One reads the complete file into a buffer and then scans the buffer for lines and linebreaks. The other version reads each line directly via ReadString() and then repositions back via FileSeek() to get the linebreak.
It handles CRLF, CR and LF. It also preserves mixed linebreaks as well as last lines not ending with a linebreak. For consistency, an empty file will result in one empty line with an empty newline, and a last line not terminated with a newline will also have an empty newline in the sLine structure. So if you just write each line's \str and \newline fields back to a file, you should end up with the exact same file (if you also rewrite the BOM and use the same format).

getLines_reseek() is slightly faster than getLines_scanBuffer() in some tests, but scanBuffer gets much slower with debugger enabled:

Code: Select all

; with    debugger : scanBuffer 10mb : 3527
; with    debugger : reseek 10mb     : 581
; without debugger : scanBuffer 10mb : 529
; without debugger : reseek 10mb     : 509
tested on USB stick. similar results on SSD. I'm using Linux here.

with a file that is larger but has fewer and longer lines, the time for scanBuffer increases but the time for reseek actually decreases:

Code: Select all

; without debugger :
; scanBuffer 10mb (~100 chars per line, 100000 lines) : 502
; scanBuffer 23mb (~500 chars per line,  45285 lines) : 902
; reseek     10mb (~100 chars per line, 100000 lines) : 397
; reseek     23mb (~500 chars per line,  45285 lines) : 370
this thread might be related though not exactly the same task: viewtopic.php?f=5&t=67194

here's the code (in Linux enable Executable Format Console to see something):

Code: Select all

; One line of a text file, split into its content and the linebreak that ended it.
Structure sLine
  str.s      ; line content without the linebreak
  newline.s  ; linebreak that terminated the line (#CRLF$, #CR$ or #LF$), or "" if there was none
EndStructure

; Reads the complete file into a string buffer and scans it for lines and linebreaks.
;
; filename : path of the file to read
; lines()  : list that receives one sLine element per line (elements are appended,
;            the list is not cleared first)
;
; Each element stores the line content in \str and the exact linebreak that ended it
; (#CRLF$, #CR$ or #LF$) in \newline. A last line without a terminating linebreak gets
; an empty \newline. An empty file yields one element with empty \str and \newline.
; If the file cannot be opened a message requester is shown and the list is left untouched.
Procedure getLines_scanBuffer(filename.s, List lines.sLine())
  Protected file, format, src.s
  Protected *pSrc.Character
  Protected *pNext.Character
  Protected *pLineStart.Character
  Protected newlineLen
  
  file = ReadFile(#PB_Any, filename)
  If file
    
    format = ReadStringFormat(file)
    ;log::l("BOM: " + Loc(file) + " bytes")
    
    ;stopwatch::restart("read into buffer")
    src = ReadString(file, format | #PB_File_IgnoreEOL)
    ;log::l(stopwatch::stopGetInfo("read into buffer"))
    CloseFile(file)
    
    ;stopwatch::restart("get lines")
    *pSrc = @src
    *pLineStart = *pSrc
    
    ; @src may be a null pointer when the string is empty, so never dereference it blindly
    If *pSrc
      While *pSrc\c
        
        ; Compare characters directly instead of building a temporary string with PeekS()
        ; for every character. Looking at the next character is safe here: the current
        ; one is not the terminator, so the next one is at worst the terminating null.
        newlineLen = 0
        If *pSrc\c = #CR
          *pNext = *pSrc + SizeOf(Character)
          If *pNext\c = #LF
            newlineLen = 2 ; CRLF
          Else
            newlineLen = 1 ; lone CR
          EndIf
        ElseIf *pSrc\c = #LF
          newlineLen = 1   ; lone LF
        EndIf
        
        If newlineLen
          AddElement(lines())
          lines()\str = PeekS(*pLineStart, (*pSrc - *pLineStart) / SizeOf(Character))
          lines()\newline = PeekS(*pSrc, newlineLen)
          ; continue behind the linebreak, which is also where the next line starts
          *pSrc + newlineLen * SizeOf(Character)
          *pLineStart = *pSrc
        Else
          *pSrc + SizeOf(Character)
        EndIf
      Wend
    EndIf
    
    If *pSrc = @src
      ; nothing was consumed: empty file has one empty line and no linebreak
      AddElement(lines())
      lines()\str = ""
      lines()\newline = ""
    ElseIf *pSrc > *pLineStart
      ; take remaining characters if not terminated by a linebreak
      AddElement(lines())
      lines()\str = PeekS(*pLineStart, (*pSrc - *pLineStart) / SizeOf(Character))
      lines()\newline = ""
    EndIf
    
    ;log::l(stopwatch::stopGetInfo("get lines"))
  Else
    MessageRequester("error","can't read file '" + filename + "'")
  EndIf
EndProcedure

; Gets lines using ReadString() and recovers each linebreak by seeking back with FileSeek().
;
; filename : path of the file to read
; lines()  : list that receives one sLine element per line (elements are appended,
;            the list is not cleared first)
;
; Each element stores the line content in \str and the exact linebreak that ended it
; (#CRLF$, #CR$ or #LF$) in \newline. A last line without a terminating linebreak gets
; an empty \newline. An empty file (nothing after the BOM) yields one element with empty
; \str and \newline. Linebreak detection is only done for #PB_Ascii, #PB_UTF8 and
; #PB_Unicode; for any other format reported by ReadStringFormat() \newline stays empty.
; If the file cannot be opened a message requester is shown and the list is left untouched.
Procedure getLines_reseek(filename.s, List lines.sLine())
  Protected file, format
  Protected lengthOfBOM, lengthOfFile, lengthOfData, locData
  Protected unitSize, c1, c2
  
  file = ReadFile(#PB_Any, filename)
  If file
    format = ReadStringFormat(file)
    ;log::l("BOM: " + Loc(file) + " bytes")
    lengthOfBOM  = Loc(file)
    lengthOfFile = Lof(file)
    lengthOfData = lengthOfFile - lengthOfBOM
    
    ; Size in bytes of one code unit. CR and LF always occupy exactly one code unit,
    ; also in UTF-8, so the linebreak can be inspected on raw code units.
    Select format
      Case #PB_Ascii, #PB_UTF8
        unitSize = 1
      Case #PB_Unicode ; (utf-16)
        unitSize = 2
      Default
        unitSize = 0   ; unsupported format: no linebreak detection
    EndSelect
    
    If lengthOfData > 0
      While Not Eof(file)
        AddElement(lines())
        lines()\str = ReadString(file, format)
        locData = Loc(file) - lengthOfBOM
        
        c1 = 0
        c2 = 0
        
        ; Step back over the last one or two code units that ReadString() consumed and
        ; read them again. Raw reads are used instead of ReadCharacter(file, format) so
        ; that a seek landing inside a multi-byte UTF-8 sequence cannot be misdecoded and
        ; the file position ends up exactly where it was before the seek.
        If unitSize > 0
          If locData >= 2 * unitSize
            FileSeek(file, -2 * unitSize, #PB_Relative)
            If unitSize = 1
              c1 = ReadAsciiCharacter(file)
              c2 = ReadAsciiCharacter(file)
            Else
              c1 = ReadUnicodeCharacter(file)
              c2 = ReadUnicodeCharacter(file)
            EndIf
          ElseIf locData = unitSize
            ; only a single code unit has been read so far
            FileSeek(file, -unitSize, #PB_Relative)
            If unitSize = 1
              c2 = ReadAsciiCharacter(file)
            Else
              c2 = ReadUnicodeCharacter(file)
            EndIf
          EndIf
        EndIf
        
        If c1 = #CR And c2 = #LF
          lines()\newline = #CRLF$
        ElseIf c2 = #CR
          lines()\newline = #CR$
        ElseIf c2 = #LF
          lines()\newline = #LF$
        Else
          ; last line of the file without a terminating linebreak
          lines()\newline = ""
        EndIf
        
      Wend
    Else
      ; empty file has one empty line and no linebreak
      AddElement(lines())
      lines()\str = ""
      lines()\newline = ""
    EndIf
    
    CloseFile(file)
  Else
    MessageRequester("error","can't read file '" + filename + "'")
  EndIf
EndProcedure

CompilerIf #PB_Compiler_IsMainFile
  ; Demo: runs both readers on this source file and prints every line together with
  ; the character codes of its linebreak. Only compiled when this is the main file.
  
  CompilerIf #PB_Compiler_Debugger
    MessageRequester("", "debugger is enabled")
  CompilerEndIf
  
  OpenConsole("title")
  
  ; Prints each element as: .... "<content>" (<linebreak character codes, space separated>)
  Procedure printLines(List lines.sLine())
    Protected pos, codes.s
    ForEach lines()
      codes = ""
      For pos = 1 To Len(lines()\newline)
        ; put the separator in front of every code but the first one
        If pos > 1
          codes + " "
        EndIf
        codes + Str(Asc(Mid(lines()\newline, pos, 1)))
      Next
      PrintN(".... " + #DQUOTE$ + lines()\str + #DQUOTE$ + " (" + codes + ")")
    Next
  EndProcedure
  
  NewList lines.sLine()
  
  ; first pass: buffer scanning version
  getLines_scanBuffer(#PB_Compiler_File, lines())
  printLines(lines())
  
  ; four blank lines separate the two listings
  PrintN("")
  PrintN("")
  PrintN("")
  PrintN("")
  
  ; second pass: ReadString()/FileSeek() version, starting from an empty list
  ClearList(lines())
  getLines_reseek(#PB_Compiler_File, lines())
  printLines(lines())
  
  ; wait for return so the console stays open
  Input()
CompilerEndIf