Page 1 of 2

CSV and Quotes

Posted: Fri Jul 20, 2007 3:46 pm
by pdwyer
************************
Code updated 2008-05-23
This code at the top is now
the (hopefully) bug free version
of the ported code found later
************************


I needed a proc like stringfield() except that it had to handle quoted strings. Often a field will have a comma in it so it gets quotes put on it like:

Firstname,lastname,City,country,DisplayName,email
Paul,Dwer,Tokyo,Japan,35,"Dwyer, Paul",PaulsEmail@PaulsDomain.com

StringField will return

Paul
Dwyer
Tokyo
Japan
36
"Dwyer
Paul"

rather than what excel will show and save:

Paul
Dwer
Tokyo
Japan
36
Dwyer, Paul
PaulsEmail@PaulsDomain.com

I was surprised not to find this proc in the tips area already... so here it is. Hopefully bug free and perhaps needing some performance tweaks. Test code added


Code: Select all



;port from http://www.xbeat.net/vbspeed/c_ParseCSV.php
; some minor deliberate variations

Structure MemoryArray 
    Byte.c[0] 
EndStructure

Declare.l ParseCSV(CSVLine.s, OutVals.s(1)) 
Dim Vals.s(0)


OpenFile(1,"F:\Programming\PureBasicCode\csv.csv") ;change this!!!!!
While Not Eof(1)

    CSVString.s = ReadString(1)
    ValCount = ParseCSV(CSVString,Vals())
    Debug "Column Count: " + Str(ValCount) + "      " + CSVString
    
    For i = 0 To valcount -1
        Debug vals(i)
    Next
    
Wend    
CloseFile(1)


Procedure.l ParseCSV(CSVLine.s, OutVals.s(1)) ; Returns count

    #lAscSpace     = 32   ; Asc(" ")
    #lAscQuote     = 34   ; Asc("""")
    #lAscSeparator = 44   ; Asc(","), comma    
    #lValueNone    = 0 ; states of the parser
    #lValuePlain   = 1
    #lValueQuoted  = 2
        
    Protected ubValues.l
    Protected cntValues.l  
    Protected lCharCode.l
    Protected posStart.l
    Protected posEnd.l
    Protected cntTrim.l
    Protected lState .l
    Protected i.l
    Protected StrLen.l = Len(CSVLine) 
     
    If StrLen > 0 
    
        *abExpression.MemoryArray = @CSVLine         ; to byte array
        ReDim OutVals.s(cntValues) 
        
        For i = 0 To StrLen -1

            lCharCode = *abExpression\byte[i] 
            Select lCharCode
                
                Case #lAscSpace
                    If lState = #lValuePlain 
                        ; at non-quoted value: trim 2 unicode bytes for each space
                        cntTrim = cntTrim + 1;2
                    EndIf
                
                Case #lAscSeparator
                    If lState = #lValueNone 
                        ; ends zero-length value
                        ReDim OutVals.s(cntValues)
                        OutVals(cntValues) = ""
                        cntValues = cntValues + 1
                        posStart = i + 1;2
                    ElseIf lState = #lValuePlain 
                        ; ends non-quoted value
                        lState = #lValueNone
                        posEnd = i - cntTrim
                        ReDim OutVals.s(cntValues)
                        OutVals(cntValues) = Mid(CSVLine, posStart + 1, posEnd - posStart)
                        OutVals(cntValues) = ReplaceString(OutVals(cntValues),Chr(34)+Chr(34),Chr(34))
                        cntValues = cntValues + 1
                        posStart = i + 1;2
                        cntTrim = 0
                    EndIf
                
                Case #lAscQuote
                    If lState = #lValueNone 
                        ; starts quoted value
                        lState = #lValueQuoted
                        ; trims the opening quote
                        posStart = i + 1;2
                    ElseIf lState = #lValueQuoted 
                        ; ends quoted value, or is a quote within
                        If *abExpression\byte[i+1] = #lAscQuote
                            i = i + 1  
                              
                        Else
                            lState = #lValuePlain                       
                            cntTrim = 1;2 ; trims the closing quote
                        EndIf
                        
                    EndIf
                
                Default
                    If lState = #lValueNone 
                        ; starts non-quoted value
                        lState = #lValuePlain
                        posStart = i
                    EndIf
                      ; reset trimming
                      cntTrim = 0                  
            EndSelect       
        Next
        
        ; remainder
        posEnd = i - cntTrim
        If cntValues <> ubValues 
            ReDim OutVals.s(cntValues)
        EndIf
        
        OutVals(cntValues) = Mid(CSVLine, posStart + 1, posEnd - posStart)
        OutVals(cntValues) = ReplaceString(OutVals(cntValues),Chr(34)+Chr(34),Chr(34))
        
        ProcedureReturn cntValues + 1
        
        Else
          ; (Expression = "")
          ; return single-element array containing a zero-length string
          ReDim OutVals.s(0)
          OutVals.s(0) = ""
          ProcedureReturn 1
    
    EndIf

EndProcedure

Re: CSV and Quotes

Posted: Fri Jul 20, 2007 3:48 pm
by PB
> ProcedureReturn = StringField(CSV_line, Column, ",")

That isn't supposed to have an = character in there.

Posted: Fri Jul 20, 2007 3:52 pm
by pdwyer
thanks,

Mine works anyway but the beta version and future might not. I'll fix it now...

Posted: Fri Jul 20, 2007 3:55 pm
by PB
ProcedureReturn is never meant to have an equals sign in it.

Posted: Sat Jul 21, 2007 12:30 am
by jear
@pdwyer

Solved the same problem this way:

Code: Select all

Declare.l CSVLearn(CSV_line.s, Delimiter.s = ",")
Declare.s CSVField(CSV_line.s, Column.l, Delimiter.s = ",") 

; ====== TEST CODE =========== 

CSVString.s = "string1,string2," + #DOUBLEQUOTE$ + "Lastname, Firstname"  + #DOUBLEQUOTE$ + ",string4,string5," + #DOUBLEQUOTE$ + "Lastname2, Firstname2"  + #DOUBLEQUOTE$ 

CSVLearn(CSVString)           ; to examine the line structure
Debug CSVString 
Debug CSVField(CSVString,5) 
Debug CSVField(CSVString,1) 
Debug CSVField(CSVString,3) 
Debug CSVField(CSVString,6) 
Debug CSVField(CSVString,4) 
Debug CSVField(CSVString,2) 
 
Debug CSVField(CSVString,7)   ; false field index checks 
Debug CSVField(CSVString,8) 
Debug CSVField(CSVString,99) 
Debug CSVField(CSVString,-277) 

; ====== Procedures ===========

Global Dim offsets.l(1) 
Procedure CSVLearn(CSV_line.s, Delimiter.s = ",")
  Protected idx.l, Field.l = 1
  Protected sField.s
  Shared nFields.l : nFields = CountString(CSV_line, Delimiter) + 1
  Global Dim offsets.l(nFields)
  While idx < nFields 
    idx + 1
    sField = StringField(CSV_line, idx, Delimiter)
    If     Right(sField, 1) = #DOUBLEQUOTE$  : Continue
    ElseIf Left(sField, 1) = #DOUBLEQUOTE$   : offsets(Field) = -idx : Field + 1
    Else                                     : offsets(Field) = idx  : Field + 1 : EndIf 
  Wend
  nFields = Field - 1 ; remember number of true fields
EndProcedure

Procedure.s CSVField(CSV_line.s, Column.l, Delimiter.s = ",")
  Protected result.s
  Shared nFields
  If Column > nFields Or Column < -nFields : ProcedureReturn "? field index ?" : EndIf   
  If offsets(Column) < #Null  
    result = StringField(CSV_line, -offsets(Column), Delimiter) + Delimiter + StringField(CSV_line, -offsets(Column) + 1, Delimiter)
  Else
    result = StringField(CSV_line, offsets(Column), Delimiter)
  EndIf 
  ProcedureReturn result 
EndProcedure

Posted: Sat Jul 21, 2007 1:00 am
by pdwyer
Thanks, I'll have a play, I tested with indexes too high but didn't try negative so that I want to see.

Do you have a function for CSVCount to work like the CountString() in this situation? Just checking before I try to write one :)

Posted: Sun Jul 22, 2007 7:17 am
by pdwyer
@jear, Yours is a little different to mine in the display of quotes, personally I prefer them stripped out but not a big deal.

I have done a CSVCount() function and put it together here.

Code: Select all


Declare.s CSVField(CSV_line.s, Column.l)
Declare.l CSVCount(CSV_line.s)

; ====== TEST CODE ===========

CSVString.s = "string1," + #DOUBLEQUOTE$ + "Lastname, Firstname"  + #DOUBLEQUOTE$ + ",string4,string5"

;CSVString.s = "string1,string2,string3"
Debug Str(CSVCount(CSVString)) + " csv columns found!"

ColCount = CSVCount(CSVString)
For i = 1 To ColCount
    Debug CSVField(CSVString,i)
Next

; ====== TEST CODE ===========


Procedure.s CSVField(CSV_line.s, Column.l)

    CurrentCol.l = 1  
    CurrentPos.l = 1
    InQuote.l = #False
    NextComma.l 
    NextQuote.l
    CurrentCol.l = 1
    LastColFound.l = #False
    
    CSV_line = RemoveString(CSV_line, #CRLF$)
    
    If FindString(CSV_line, #DOUBLEQUOTE$, 1) = 0 ; no quotes found, take shortcut
        ProcedureReturn StringField(CSV_line, Column, ",")
    EndIf

    CSVLen = Len(CSV_line)
    While (CurrentPos < CSVLen) 

        If InQuote = #False
            NextComma = FindString(CSV_line, ",", CurrentPos)    
            NextQuote = FindString(CSV_line, #DOUBLEQUOTE$, CurrentPos)
                       
            If NextComma = 0 : NextComma = CSVLen : EndIf
            If NextQuote = 0 : NextQuote = CSVLen : EndIf

            If (NextComma < NextQuote) ;text To Next comma is field

                If CurrentCol = Column ; this is the requested col
                    ProcedureReturn Mid(CSV_line, CurrentPos, NextComma - CurrentPos)
                Else
                    ;move on to next
                    CurrentCol + 1
                    CurrentPos = NextComma + 1
                EndIf 
            ElseIf NextComma > NextQuote ; ignore the comma as it's in quotes
                InQuote = #True
                CurrentPos = NextQuote + 1
            Else ;zero's returned  
                If CurrentCol = Column
                    ProcedureReturn Mid(CSV_line, CurrentPos, CSVLen)   
                Else
                    ProcedureReturn ""
                EndIf 
            EndIf
            
        Else  ;inside quote, jump to next
            InQuote = #False
            NextQuote = FindString(CSV_line, #DOUBLEQUOTE$, CurrentPos)
            If CurrentCol = Column ; this is the requested col
                ProcedureReturn Mid(CSV_line, CurrentPos, NextQuote - CurrentPos)
            Else
                ;move on to next
                CurrentCol + 1
                NextComma = FindString(CSV_line, ",", NextQuote)
                If NextComma = 0
                    NextComma = CSVLen
                EndIf
                CurrentPos = NextComma + 1
            EndIf            
        
        EndIf
    Wend        

EndProcedure

;==================================================================

Procedure.l CSVCount(CSV_line.s)

    CurrentCol.l = 1  
    CurrentPos.l = 1
    InQuote.l = #False
    NextComma.l 
    NextQuote.l
    CurrentCol.l = 1
    LastColFound.l = #False
    
    CSV_line = RemoveString(CSV_line, #CRLF$)
    
    If FindString(CSV_line, #DOUBLEQUOTE$, 1) = 0 ; no quotes found, take shortcut
        ProcedureReturn CountString(CSV_line, ",") + 1
    EndIf

    CSVLen = Len(CSV_line)
    While (CurrentPos < CSVLen) 

        If InQuote = #False
            NextComma = FindString(CSV_line, ",", CurrentPos)    
            NextQuote = FindString(CSV_line, #DOUBLEQUOTE$, CurrentPos)
                       
            If NextComma = 0 : NextComma = CSVLen : EndIf
            If NextQuote = 0 : NextQuote = CSVLen : EndIf

            If (NextComma < NextQuote) ;text To Next comma is field

                CurrentCol + 1
                CurrentPos = NextComma + 1

            ElseIf NextComma > NextQuote ; ignore the comma as it's in quotes
                InQuote = #True
                CurrentPos = NextQuote + 1
            Else ;zero's returned  
                ProcedureReturn CurrentCol
            EndIf
            
        Else  ;inside quote, jump to next
            InQuote = #False
            NextQuote = FindString(CSV_line, #DOUBLEQUOTE$, CurrentPos)

            CurrentCol + 1
            NextComma = FindString(CSV_line, ",", NextQuote)
            If NextComma = 0
                NextComma = CSVLen
            EndIf
            CurrentPos = NextComma + 1        
        EndIf
    Wend        

    ProcedureReturn CurrentCol -1

EndProcedure


Posted: Sun Jul 22, 2007 10:45 am
by Kiffi
@pdwyer:

if the CSV-String includes an odd count of doublequotes your code gets into
an endless loop.

Code: Select all

CSVString.s = "string1," + #DOUBLEQUOTE$ + "Lastname, Firstname" + ",string4,string5"
Greetings ... Kiffi

Posted: Sun Jul 22, 2007 12:37 pm
by pdwyer
thanks... Will take a look and repost

Posted: Sun Jul 22, 2007 12:58 pm
by pdwyer
Okay, I see what you mean now. Basically it's not handling this type of invalid csv or 'difficult' csv. I'll have to have a think about it.

This is valid and fails CSVString.s = "lkjlkjlkj,lkjlkjlkj," + #DOUBLEQUOTE$ + "lkj,l;k" + #DOUBLEQUOTE$ + "," + #DOUBLEQUOTE$ + ";lkj" + #DOUBLEQUOTE$ + "" + #DOUBLEQUOTE$ + " ;lk" + #DOUBLEQUOTE$ + "," + #DOUBLEQUOTE$ + "sdfsf" + #DOUBLEQUOTE$ + "" + #DOUBLEQUOTE$ + " , " + #DOUBLEQUOTE$ + ""

it looks like this if you open excels csv file in notepad, it contains not only commas in the display text but double quotes too

lkjlkjlkj,lkjlkjlkj,"lkj,l;k",";lkj"" ;lk","sdfsf"" , "

Column display should look like

lkjlkjlkj
lkjlkjlkj
lkj,l;k
;lkj" ;lk
sdfsf" ,


I found a list of rules for CSV on my favourite source so I'll try a proper implementation...

http://en.wikipedia.org/wiki/Comma-separated_values

It seems you are allowed to embed crlf in the cell for a two line cell too! I didn't know that

Posted: Sun Jul 22, 2007 5:06 pm
by Dummy
no real CSV, but it's able to handle such with some limitations:
http://www.purebasic.fr/english/viewtopic.php?t=26791

Posted: Tue May 20, 2008 2:28 pm
by Kiffi
Hello pdwyer,
pdwyer wrote:I found a list of rules for CSV on my favourite source so I'll try a proper
implementation...
any news? ;-)

Greetings ... Kiffi

Posted: Wed May 21, 2008 11:05 am
by pdwyer
:oops: nope,

I never needed it and it fell off the back burner.

I might dig it up later as I think PB needs a good CSV tool and if I can write a crappy but working one then the geniuses around here can fix it up for me :D

I was looking for a small project too :P

Posted: Wed May 21, 2008 12:22 pm
by pdwyer
The wiki article has been updated, there's some code linked on other pages. This one looks nice in VB and will mean we can have a normal and unicode version.

http://www.xbeat.net/vbspeed/c_ParseCSV.php

I'll give porting it a go :)

Posted: Wed May 21, 2008 2:40 pm
by pdwyer
okay, here is my first port attempt, it's not heavily tested so there's bound to be patching needed. Seems to work in a basic sense though, haven't tried unicode either but I guess it might work as is due to the "character" type in the pointer structure

Code: Select all


Structure MemoryArray 
    Byte.c[0] 
EndStructure

Declare.l ParseCSV(CSVLine.s, OutVals.s(1)) 

Dim Vals.s(0)
CSVString.s = "aa,"+Chr(34)+"bb,cc"+Chr(34)+",dd,ee"
ValCount = ParseCSV(CSVString,Vals())

Debug valcount

For i = 0 To valcount -1
    Debug vals(i)
Next


Procedure.l ParseCSV(CSVLine.s, OutVals.s(1)) ; Returns count


    #lAscSpace     = 32   ; Asc(" ")
    #lAscQuote     = 34   ; Asc("""")
    #lAscSeparator = 44   ; Asc(","), comma
    
    #lValueNone    = 0 ; states of the parser
    #lValuePlain   = 1
    #lValueQuoted  = 2
        
    Protected ubValues.l
    Protected cntValues.l
    
    Protected lCharCode.l
    Protected posStart.l
    Protected posEnd.l
    Protected cntTrim.l
    Protected lState .l
    Protected i.l
    Protected StrLen.l = Len(CSVLine) 
  
    If StrLen > 0 
    
        *abExpression.MemoryArray = @CSVLine         ; to byte array

        ReDim OutVals.s(cntValues) 
        
        For i = 0 To StrLen -1

          lCharCode = *abExpression\byte[i] 

            Select lCharCode
                
                Case #lAscSpace
                    If lState = #lValuePlain 
                        ; at non-quoted value: trim 2 unicode bytes for each space
                        cntTrim = cntTrim + 1;2
                    EndIf
                
                Case #lAscSeparator
                    If lState = #lValueNone 
                        ; ends zero-length value
                        ReDim OutVals.s(cntValues)
                        OutVals(cntValues) = ""
                        cntValues = cntValues + 1
                        posStart = i + 1;2
                    ElseIf lState = #lValuePlain 
                        ; ends non-quoted value
                        lState = #lValueNone
                        posEnd = i - cntTrim
                        ReDim OutVals.s(cntValues)
                        OutVals(cntValues) = Mid(CSVLine, posStart + 1, posEnd - posStart)
                        cntValues = cntValues + 1
                        posStart = i + 1;2
                        cntTrim = 0
                    EndIf
                
                Case #lAscQuote
                    If lState = #lValueNone 
                        ; starts quoted value
                        lState = #lValueQuoted
                        ; trims the opening quote
                        posStart = i + 1;2
                    ElseIf lState = #lValueQuoted 
                        ; ends quoted value, or is a quote within
                        lState = #lValuePlain
                        ; trims the closing quote
                        cntTrim = 1;2
                    EndIf
                
                Default
                    If lState = #lValueNone 
                        ; starts non-quoted value
                        lState = #lValuePlain
                        posStart = i
                    EndIf
                      ; reset trimming
                      cntTrim = 0                  
            EndSelect       
        Next
        
        ; remainder
        posEnd = i - cntTrim
        If cntValues <> ubValues 
            ReDim OutVals.s(cntValues)
        EndIf
        
        OutVals(cntValues) = Mid(CSVLine, posStart + 1, posEnd - posStart)

        ProcedureReturn cntValues + 1

        Else
          ; (Expression = "")
          ; return single-element array containing a zero-length string
          ReDim OutVals.s(0)
          OutVals.s(0) = ""
          ProcedureReturn 1
    
    EndIf

EndProcedure