Page 1 of 1

Parsing a file and counting lines and words

Posted: Sat Nov 02, 2002 10:46 am
by BackupUser
Code updated for 5.20+

Restored from previous forum. Originally posted by fweil.

Thanks to Art Sentinel's challenge, I am posting here a copy of the code I wrote to parse a file and count its lines and words.

Please see viewtopic.php?t=2450 (http://forums.pur ... php?t=2450) to understand the history.

This code opens a file and loads it into memory for fast processing. It counts lines and words, then opens Notepad with the list of unique words and the count of each.

I did not use linked lists because I am not used to working with them, and when I tried coding with them, performance seemed a bit slower than with arrays.

The file to memory procedures are updates I made from Horst's sample code available at

Code: Select all

; Character conversion table: AsciiConv(code) holds the replacement string for
; the character with that code. The table is sized to cover every character
; code the current string mode can produce.
CompilerIf #PB_Compiler_Unicode
  Global Dim AsciiConv.s(65535)
CompilerElse
  Global Dim AsciiConv.s(255)
CompilerEndIf

; Word storage. "Global Dim" is all that is needed to make an array global;
; listing the array names again in a plain Global statement would only create
; unrelated scalar variables with the same names, so they are not declared here.
Global Dim AllWords.s(10000000)   ; every word found, in file order (sorted later)
Global Dim UniqueWords.s(1000000) ; distinct words
Global Dim WordCount.l(1000000)   ; WordCount(k) = occurrences of UniqueWords(k)

; Counters and shared state.
Global NLines.l, NWords.l, EOL.s
; In-memory file state: read offset, buffer size, last used directory, buffer pointer.
Global MemFileOffset.l, MemFileSize.l, CurrentDirectory.s, *FileBuffer
; Legacy memory-bank ID, kept so existing references to it still compile.
#FileBufferMem = 0
; Loads the whole file fname into a freshly allocated memory block.
;   fileID : PureBasic file number used while reading
;   fname  : path of the file to load
; Returns the buffer address (also stored in the global *FileBuffer), or 0 when
; the file cannot be opened, is empty, or the allocation fails.
; Side effects: sets MemFileSize to the number of bytes loaded and resets
; MemFileOffset to 0.
; The return type is .i so the address is not truncated in 64-bit builds.
Procedure.i LoadFileToMem(fileID, fname.s)
  ; Release the buffer of a previously loaded file so repeated loads do not
  ; leak, and so a failed load never hands back a stale pointer.
  If *FileBuffer
    FreeMemory(*FileBuffer)
    *FileBuffer = 0
  EndIf
  MemFileSize = 0
  MemFileOffset = 0 ; reset

  If ReadFile(fileID, fname)
    MemFileSize = Lof(fileID)
    Debug "MemFileSize = " + Str(MemFileSize)
    If MemFileSize > 0
      *FileBuffer = AllocateMemory(MemFileSize)
      If *FileBuffer
        ; Keep the size in sync with what was really read.
        MemFileSize = ReadData(fileID, *FileBuffer, MemFileSize)
      Else
        MemFileSize = 0
      EndIf
    EndIf
    CloseFile(fileID)
  EndIf
  ProcedureReturn *FileBuffer
EndProcedure
; Tells whether unread bytes remain in the in-memory file.
; Returns 1 while the global read offset is below the loaded size, else 0.
Procedure MoreInMem()
  ProcedureReturn Bool(MemFileOffset < MemFileSize)
EndProcedure
; Returns the next line of the in-memory file, without its line terminator,
; and advances MemFileOffset to the first byte of the following line.
; A line ends at CR, LF, CR+LF, LF+CR or at the end of the buffer.
; In case EOF (or no buffer loaded): empty line is returned.
Procedure.s ReadLineFromMem()
  Protected *lineStart, length, eol.b, follower.b

  If *FileBuffer = 0 Or MoreInMem() = 0
    ProcedureReturn ""
  EndIf

  ; The line starts exactly at the current offset, so the very first
  ; character of the file is part of the first line.
  *lineStart = *FileBuffer + MemFileOffset

  ; Scan up to the terminator, never reading past the end of the buffer.
  While MemFileOffset + length < MemFileSize
    eol = PeekB(*lineStart + length)
    If eol = 13 Or eol = 10
      Break
    EndIf
    length + 1
  Wend
  MemFileOffset + length

  ; Consume the terminator, if the line did not simply end with the buffer.
  If MemFileOffset < MemFileSize
    MemFileOffset + 1
    If MemFileOffset < MemFileSize
      follower = PeekB(*FileBuffer + MemFileOffset)
      ; Only a DIFFERENT second terminator byte belongs to this line ending;
      ; two identical bytes (CR CR or LF LF) enclose an empty line.
      If (follower = 13 Or follower = 10) And follower <> eol
        MemFileOffset + 1
      EndIf
    EndIf
  EndIf

  If length = 0
    ProcedureReturn ""
  EndIf
  ProcedureReturn PeekS(*lineStart, length, #PB_Ascii)
EndProcedure
; Releases the in-memory file buffer and resets the related state.
; Safe to call when no file is loaded, and safe to call more than once.
Procedure CloseFileMem()
  If *FileBuffer
    ; Free the block that was actually allocated; the #FileBufferMem constant
    ; is 0 and is not a valid memory address.
    FreeMemory(*FileBuffer)
    *FileBuffer = 0
  EndIf
  MemFileSize = 0
  MemFileOffset = 0
EndProcedure
; Integer remainder of a divided by b, computed from the integer quotient.
;   a : dividend
;   b : divisor (must not be 0)
; Returns a minus b times the integer quotient a / b.
Procedure.l IMod(a.l, b.l)
  Protected quotient.l, wholePart.l
  quotient = a / b
  wholePart = b * quotient
  ProcedureReturn a - wholePart
EndProcedure
; Loads FileName into memory and splits every line into words.
;   FileName : path of the text file to parse; an empty name (cancelled
;              requester) is ignored.
; Each character is translated through the global AsciiConv() table; a
; character translated to a single space ends the current word. Words are
; appended to the global AllWords() array; the globals NLines and NWords are
; incremented as lines and words are found. Progress is shown in status bar 0
; and in gadget 100.
Procedure ParseFile(FileName.s)
  Protected line$, word$, ch$, i, lastSlot, overflow

  Debug "Parsing : " + FileName
  If FileName = ""
    ProcedureReturn ; nothing selected
  EndIf
  SetGadgetText(100, "Processing file " + FileName)
  CurrentDirectory = GetPathPart(FileName)
  lastSlot = ArraySize(AllWords())

  If LoadFileToMem(0, FileName)
    While MoreInMem()
      NLines + 1
      ; The appended space is a sentinel: it flushes the last word of the
      ; line through the same code path as every other word.
      line$ = ReadLineFromMem() + " "
      word$ = ""
      For i = 1 To Len(line$)
        ch$ = AsciiConv(Asc(Mid(line$, i, 1)))
        If ch$ = " "
          If word$ <> ""
            ; Never write past the end of AllWords().
            If NWords <= lastSlot
              AllWords(NWords) = word$
              NWords + 1
            Else
              overflow = #True
            EndIf
            word$ = ""
          EndIf
        Else
          word$ = word$ + ch$
        EndIf
      Next
      If IMod(NLines, 2500) = 0
        StatusBarText(0, 0, "Parsing line #" + Str(NLines) + " ... found " + Str(NWords) + " words.", 0)
      EndIf
    Wend
    StatusBarText(0, 0, "Parsing line #" + Str(NLines) + " ... found " + Str(NWords) + " words.", 0)
    If overflow
      Debug "Word table full: additional words were ignored"
    EndIf
  EndIf
EndProcedure
;;;
; Main program: builds the character conversion table, opens the window and
; runs the event loop. "Open file" parses the chosen file, counts the unique
; words, writes the report to result.txt and opens it.
Quit.l = #False
WindowXSize.l = 320
WindowYSize.l = 240
CurrentDirectory = Space(255)
GetCurrentDirectory_(255, @CurrentDirectory)
EOL.s = Chr(13) + Chr(10)

; Identity mapping for every slot of the table (the table is larger than 256
; entries in Unicode builds), then every separator is mapped to a space.
For i = 0 To ArraySize(AsciiConv())
  AsciiConv(i) = Chr(i)
Next
Separators.s = ".,:;+-*/()[]'!?{}=<>" + Chr(34) + Chr(9)
For i = 1 To Len(Separators)
  AsciiConv(Asc(Mid(Separators, i, 1))) = " "
Next

If OpenWindow(0, 200, 500, WindowXSize, WindowYSize, "MyWindow", #PB_Window_SystemMenu | #PB_Window_MinimizeGadget | #PB_Window_MaximizeGadget | #PB_Window_SizeGadget | #PB_Window_TitleBar)
  AddKeyboardShortcut(0, #PB_Shortcut_Escape, 99)
  ;fontVerd11.l = LoadFont(0,"Verdana",12)
  If CreateMenu(0, WindowID(0))
    OpenSubMenu("General")
    MenuItem(11, "Open file")
    MenuItem(99, "Quit")
    CloseSubMenu()
  EndIf
  If CreateStatusBar(0, WindowID(0))
    AddStatusBarField(200)
    StatusBarText(0, 0, "Idle ...", 0)
  EndIf

  ;SetGadgetFont(#PB_Default,FontID(fontVerd11.l))
  TextGadget(100, 10, 10, WindowXSize - 20, WindowYSize - 40, "")

  SetGadgetText(100, "Select a file to process ...")
  Repeat
    Select WaitWindowEvent()
      Case #PB_Event_CloseWindow
        Quit = #True
      Case #PB_Event_Menu
        Select EventMenu()
          Case 11
            ; Avoid a doubled backslash when the directory already ends with one.
            StartPath.s = CurrentDirectory
            If Right(StartPath, 1) <> "\"
              StartPath + "\"
            EndIf
            FileName.s = OpenFileRequester("Select a file", StartPath + "*.txt", "Text files|*.txt|All files|*.*", 0, #PB_Requester_MultiSelection)
            ; An empty name means the requester was cancelled: do nothing.
            If FileName <> ""
              NLines = 0
              NWords = 0
              tz.l = GetTickCount_()
              ParseFile(FileName)
              SetGadgetText(100, "File : " + FileName + EOL + "Lines : " + Str(NLines) + EOL + "Words : " + Str(NWords))

              ; Sort the words so equal words are adjacent, then count runs.
              NUniqueWords.l = 0
              If NWords > 0
                SortArray(AllWords(), 0, 0, NWords - 1)
                j = 0
                UniqueWords(0) = AllWords(0)
                WordCount(0) = 1
                For i = 1 To NWords - 1
                  If AllWords(i) <> AllWords(i - 1)
                    If j >= ArraySize(UniqueWords())
                      Break ; unique-word table is full
                    EndIf
                    j + 1
                    UniqueWords(j) = AllWords(i)
                    WordCount(j) = 1
                  Else
                    WordCount(j) + 1
                  EndIf
                Next
                NUniqueWords = j + 1
              EndIf

              SetGadgetText(100, "File : " + FileName + EOL + "Lines : " + Str(NLines) + EOL + "Words : " + Str(NWords) + EOL + "Unique words : " + Str(NUniqueWords) + EOL + "Done in " + Str(GetTickCount_() - tz) + "ms")
              If CreateFile(0, "result.txt")
                For z = 0 To NUniqueWords - 1
                  WriteStringN(0, Str(z) + " " + UniqueWords(z) + Chr(9) + Chr(9) + Str(WordCount(z)))
                Next
                CloseFile(0)
                ; Only open the report when it was actually written.
                ShellExecute_(WindowID(0),"open","result.txt","","",#SW_SHOWNORMAL)
              EndIf
            EndIf
          Case 99
            Quit = #True
        EndSelect
    EndSelect
  Until Quit
EndIf
; Release the file buffer only if one was allocated.
If *FileBuffer
  FreeMemory(*FileBuffer)
EndIf
End
Francois Weil
14, rue Douer
F64100 Bayonne

Posted: Sat Nov 02, 2002 3:51 pm
by BackupUser
Restored from previous forum. Originally posted by horst.

Hi Francois,

I see you took advantage of my buffered file reading :)
with Peek instead of ASM. Should be fast enough, normally.
I made these functions for a commercial program, where
speed was very important (especially when the file is
scanned over the network).

Two problems I found in your source:

(1) When lines are terminated by CR or LF only, you
won't catch empty lines properly. You should skip these
bytes only if not the same as the preceding one.

(2) You don't catch the first word of the file correctly,
because you peekS at Start+1 to return the string.

And: you should use CloseFileMem() to release the memory block.

BTW: You need not declare Arrays as Global (that will only
make a long variable with the same name).



Horst

Posted: Sat Nov 02, 2002 7:13 pm
by BackupUser
Restored from previous forum. Originally posted by fweil.

Thanks, Horst, for your code and comments.

KRgrds

Francois Weil
14, rue Douer
F64100 Bayonne

Posted: Thu Nov 07, 2002 10:14 am
by BackupUser
Restored from previous forum. Originally posted by horst.

I made a new version of the buffered line reading
with Peek instead of InlineASM, and I tested the speed
(2.5Mb file with 20,000 lines):

ASM version: 0.10 sec
Peek version: 0.16 sec
ReadString(): 5.42 sec, network: 17 minutes (!)

The Peek version should be fast enough in most cases,
and can be easily modified if necessary.

http://home.mnet-online.de/horst.muc/pb/


Horst

Posted: Mon May 29, 2006 6:15 am
by Randy Walker
Well, that was a bit of a chore ... restoring all the CRs after pasting the sample code above. Anyway, I was also very impressed by the performance, so I'm posting the code converted to PB4.0 format to spare others the trouble. Thanks again to Art and Fweil ...

Code: Select all

; Character conversion table: AsciiConv(code) holds the replacement string for
; the character with that code.
Global Dim AsciiConv.s(255)
; Word storage. "Global Dim" already makes the arrays global; repeating the
; array names in a plain Global statement would only create unrelated scalar
; variables with the same names, so they are not declared here.
Global Dim AllWords.s(10000000)   ; every word found, in file order (sorted later)
Global Dim UniqueWords.s(1000000) ; distinct words
Global Dim WordCount.l(1000000)   ; WordCount(k) = occurrences of UniqueWords(k)
; Counters and shared state.
Global NLines.l, NWords.l, EOL.s
; In-memory file state: read offset, buffer size, last used directory, buffer pointer.
Global MemFileOffset.l, MemFileSize.l, CurrentDirectory.s, *FileBuffer
; Legacy memory-bank ID, kept so existing references to it still compile.
#FileBufferMem = 0
; Loads the whole file fname into a freshly allocated memory block.
;   fileID : PureBasic file number used while reading
;   fname  : path of the file to load
; Returns the buffer address (also stored in the global *FileBuffer), or 0 when
; the file cannot be opened, is empty, or the allocation fails.
; Side effects: sets MemFileSize to the file length and resets MemFileOffset.
Procedure.l LoadFileToMem(fileID,fname.s)
  ; Release the buffer of a previously loaded file so repeated loads do not
  ; leak, and so a failed load never hands back a stale pointer.
  If *FileBuffer
    FreeMemory(*FileBuffer)
    *FileBuffer = 0
  EndIf
  MemFileSize = 0
  MemFileOffset = 0 ; reset

  If ReadFile(fileID,fname)
    MemFileSize = Lof(fileID)
    Debug "MemFileSize = " + Str(MemFileSize)
    If MemFileSize > 0
      *FileBuffer = AllocateMemory(MemFileSize)
      If *FileBuffer
        ReadData(fileID, *FileBuffer, MemFileSize)
      Else
        MemFileSize = 0 ; nothing loaded, so nothing to read
      EndIf
    EndIf
    CloseFile(fileID)
  EndIf
  ProcedureReturn *FileBuffer
EndProcedure
; Tells whether unread bytes remain in the in-memory file.
; Returns 1 while the global read offset is below the loaded size, else 0.
Procedure MoreInMem()
  Protected remaining
  remaining = 0
  If MemFileSize > MemFileOffset
    remaining = 1
  EndIf
  ProcedureReturn remaining
EndProcedure
; Returns the next line of the in-memory file, without its line terminator,
; and advances MemFileOffset to the first byte of the following line.
; A line ends at CR, LF, CR+LF, LF+CR or at the end of the buffer.
; In case EOF (or no buffer loaded): empty line is returned.
Procedure.s ReadLineFromMem()
  Protected *lineStart, length, eol.b, follower.b

  If *FileBuffer = 0 Or MoreInMem() = 0
    ProcedureReturn ""
  EndIf

  ; The line starts exactly at the current offset, so the very first
  ; character of the file is part of the first line.
  *lineStart = *FileBuffer + MemFileOffset

  ; Scan up to the terminator, never reading past the end of the buffer.
  While MemFileOffset + length < MemFileSize
    eol = PeekB(*lineStart + length)
    If eol = 13 Or eol = 10
      Break
    EndIf
    length + 1
  Wend
  MemFileOffset + length

  ; Consume the terminator, if the line did not simply end with the buffer.
  If MemFileOffset < MemFileSize
    MemFileOffset + 1
    If MemFileOffset < MemFileSize
      follower = PeekB(*FileBuffer + MemFileOffset)
      ; Only a DIFFERENT second terminator byte belongs to this line ending;
      ; two identical bytes (CR CR or LF LF) enclose an empty line.
      If (follower = 13 Or follower = 10) And follower <> eol
        MemFileOffset + 1
      EndIf
    EndIf
  EndIf

  If length = 0
    ProcedureReturn ""
  EndIf
  ProcedureReturn PeekS(*lineStart, length)
EndProcedure
; Releases the in-memory file buffer and resets the related state.
; Safe to call when no file is loaded, and safe to call more than once.
Procedure CloseFileMem()
  If *FileBuffer
    ; Free the block that was actually allocated; the #FileBufferMem constant
    ; is 0 and is not a valid memory address.
    FreeMemory(*FileBuffer)
    *FileBuffer = 0
  EndIf
  MemFileSize = 0
  MemFileOffset = 0
EndProcedure
; Integer remainder of a divided by b.
;   a : dividend
;   b : divisor (must not be 0)
; Returns what is left of a after removing b times the integer quotient a / b.
Procedure.l IMod(a.l, b.l)
  Protected times.l
  times = a / b
  ProcedureReturn a - (times * b)
EndProcedure
; Loads FileName into memory and splits every line into words.
;   FileName : path of the text file to parse; an empty name (cancelled
;              requester) is ignored.
; Each character is translated through the global AsciiConv() table; a
; character translated to a single space ends the current word. Words are
; appended to the global AllWords() array; the globals NLines and NWords are
; incremented as lines and words are found. Progress is shown in status bar 0
; and in gadget 100.
Procedure ParseFile(FileName.s)
  Protected line$, word$, ch$, i, lastSlot, overflow

  Debug "Parsing : " + FileName
  If FileName = ""
    ProcedureReturn ; nothing selected
  EndIf
  SetGadgetText(100, "Processing file " + FileName)
  CurrentDirectory = GetPathPart(FileName)
  ; Highest valid index of AllWords(); must match its Dim statement.
  lastSlot = 10000000

  If LoadFileToMem(0, FileName)
    While MoreInMem()
      NLines + 1
      ; The appended space is a sentinel: it flushes the last word of the
      ; line through the same code path as every other word.
      line$ = ReadLineFromMem() + " "
      word$ = ""
      For i = 1 To Len(line$)
        ch$ = AsciiConv(Asc(Mid(line$, i, 1)))
        If ch$ = " "
          If word$ <> ""
            ; Never write past the end of AllWords().
            If NWords <= lastSlot
              AllWords(NWords) = word$
              NWords + 1
            Else
              overflow = #True
            EndIf
            word$ = ""
          EndIf
        Else
          word$ = word$ + ch$
        EndIf
      Next
      If IMod(NLines, 2500) = 0
        StatusBarText(0, 0, "Parsing line #" + Str(NLines) + " ... found " + Str(NWords) + " words.", 0)
      EndIf
    Wend
    StatusBarText(0, 0, "Parsing line #" + Str(NLines) + " ... found " + Str(NWords) + " words.", 0)
    If overflow
      Debug "Word table full: additional words were ignored"
    EndIf
  EndIf
EndProcedure
;;;
; Main program: builds the character conversion table, opens the window and
; runs the event loop. "Open file" parses the chosen file, counts the unique
; words, writes the report to result.txt and opens it.
Quit.l = #False
WindowXSize.l = 320
WindowYSize.l = 240
CurrentDirectory = Space(255)
GetCurrentDirectory_(255, @CurrentDirectory)
EOL.s = Chr(13) + Chr(10)

; Identity mapping first, then every separator is mapped to a space.
For i = 0 To 255
  AsciiConv(i) = Chr(i)
Next
Separators.s = ".,:;+-*/()[]'!?{}=<>" + Chr(34) + Chr(9)
For i = 1 To Len(Separators)
  AsciiConv(Asc(Mid(Separators, i, 1))) = " "
Next

If OpenWindow(0, 200, 500, WindowXSize, WindowYSize, "MyWindow", #PB_Window_SystemMenu | #PB_Window_MinimizeGadget | #PB_Window_MaximizeGadget | #PB_Window_SizeGadget | #PB_Window_TitleBar)
  AddKeyboardShortcut(0, #PB_Shortcut_Escape, 99)
  ;fontVerd11.l = LoadFont(0,"Verdana",12)
  If CreateMenu(0, WindowID(0))
    OpenSubMenu("General")
    MenuItem(11, "Open file")
    MenuItem(99, "Quit")
    CloseSubMenu()
  EndIf
  If CreateStatusBar(0, WindowID(0))
    ; The status bar needs a field before text can be written to field 0.
    AddStatusBarField(200)
    StatusBarText(0, 0, "Idle ...", 0)
  EndIf
  If CreateGadgetList(WindowID(0))
    ;SetGadgetFont(#PB_Default,FontID(fontVerd11.l))
    TextGadget(100, 10, 10, WindowXSize - 20, WindowYSize - 40, "")
  EndIf
  SetGadgetText(100, "Select a file to process ...")
  Repeat
    Select WaitWindowEvent()
      Case #PB_Event_CloseWindow
        Quit = #True
      Case #PB_Event_Menu
        Select EventMenu()
          Case 11
            FileName.s = OpenFileRequester("Select a file", CurrentDirectory + "\" + "*.txt", "Text files|*.txt|All files|*.*", 0, #PB_Requester_MultiSelection)
            ; An empty name means the requester was cancelled: do nothing.
            If FileName <> ""
              NLines = 0
              NWords = 0
              tz.l = GetTickCount_()
              ParseFile(FileName)
              SetGadgetText(100, "File : " + FileName + EOL + "Lines : " + Str(NLines) + EOL + "Words : " + Str(NWords))

              ; Sort the words so equal words are adjacent, then count runs.
              NUniqueWords.l = 0
              If NWords > 0
                SortArray(AllWords(), 0, 0, NWords - 1)
                j = 0
                UniqueWords(0) = AllWords(0)
                WordCount(0) = 1
                For i = 1 To NWords - 1
                  If AllWords(i) <> AllWords(i - 1)
                    ; 1000000 is the highest index of UniqueWords() / WordCount().
                    If j >= 1000000
                      Break ; unique-word table is full
                    EndIf
                    j + 1
                    UniqueWords(j) = AllWords(i)
                    WordCount(j) = 1
                  Else
                    WordCount(j) + 1
                  EndIf
                Next
                NUniqueWords = j + 1
              EndIf

              SetGadgetText(100, "File : " + FileName + EOL + "Lines : " + Str(NLines) + EOL + "Words : " + Str(NWords) + EOL + "Unique words : " + Str(NUniqueWords) + EOL + "Done in " + Str(GetTickCount_() - tz) + "ms")
              If CreateFile(0, "result.txt")
                For z = 0 To NUniqueWords - 1
                  WriteStringN(0, Str(z) + " " + UniqueWords(z) + Chr(9) + Chr(9) + Str(WordCount(z)))
                Next
                CloseFile(0)
                ; Only open the report when it was actually written.
                ShellExecute_(WindowID(0),"open","result.txt","","",#SW_SHOWNORMAL)
              EndIf
            EndIf
          Case 99
            Quit = #True
        EndSelect
    EndSelect
  Until Quit
EndIf
; Release the file buffer (not the reader's local *addr), and only if one
; was allocated.
If *FileBuffer
  FreeMemory(*FileBuffer)
EndIf
End