I just posted this code in the German forum as an answer, and I thought it might be useful for others here, too.
Code: Select all
; Tries to determine the character set of the text in the given memory area.
;
; Parameters:
;   MemPointer - address of the first byte to inspect.
;   ByteLength - number of bytes available at MemPointer. If it is 0 (or
;                negative), the data is assumed to be null terminated and is
;                scanned up to the first zero byte. A zero byte also ends the
;                scan when a length is given.
;
; Possible return codes:
;   "utf-16LE-bom" -> starts with the UTF-16LE byte order mark ($FF $FE)
;   "utf-16BE-bom" -> starts with the UTF-16BE byte order mark ($FE $FF)
;   "utf-8bom"     -> starts with the UTF-8 byte order mark ($EF $BB $BF)
;   "ascii"        -> only 7 bit chars
;   "utf-8"        -> contains bytes > 127 and all of them form well-shaped
;                     UTF-8 sequences (lead byte + matching "10xxxxxx" bytes)
;   "iso"          -> contains bytes > 127, but they are not UTF-8
;
; No byte at or beyond MemPointer + ByteLength is read when ByteLength > 0.
Procedure.s GetCharactersetMemory(MemPointer.i, ByteLength.i = 0)
  Protected B0.a, B1.a
  Protected Code.a = 0, AddBytes.i = 0
  Protected x = 0, a = 0
  Protected ValidUTF8 = #True ; stays true until a malformed sequence is seen
  Protected IsASC = #True     ; stays true until a byte > 127 is seen

  ; --- Byte order marks ------------------------------------------------------
  ; Every further byte is only read once the previous one matched. In
  ; null-terminated mode a matching (non-zero) byte guarantees that the next
  ; byte - at worst the terminator - exists; with an explicit length the
  ; length is checked instead.
  If ByteLength <= 0 Or ByteLength >= 2
    B0 = PeekA(MemPointer)
    If B0 = $EF Or B0 = $FF Or B0 = $FE
      B1 = PeekA(MemPointer + 1)
      If B0 = $FF And B1 = $FE
        ; correct UTF16LE BOM
        ProcedureReturn "utf-16LE-bom"
      EndIf
      If B0 = $FE And B1 = $FF
        ; correct UTF16BE BOM
        ProcedureReturn "utf-16BE-bom"
      EndIf
      If B0 = $EF And B1 = $BB
        If ByteLength <= 0 Or ByteLength >= 3
          If PeekA(MemPointer + 2) = $BF
            ; correct UTF8 BOM
            ProcedureReturn "utf-8bom"
          EndIf
        EndIf
      EndIf
    EndIf
  EndIf

  ; --- Content scan ----------------------------------------------------------
  Repeat
    Code = PeekA(MemPointer + x)
    If Code > 127
      IsASC = #False
      ; This may be the beginning of a UTF8 char. The lead byte tells how many
      ; continuation bytes have to follow.
      ; NOTE: the 5 and 6 byte forms are accepted as in the original code,
      ; although RFC 3629 limits UTF-8 to 4 bytes per character.
      If (Code & %11100000) = %11000000     ; 1 additional byte
        AddBytes = 1
      ElseIf (Code & %11110000) = %11100000 ; 2 additional bytes
        AddBytes = 2
      ElseIf (Code & %11111000) = %11110000 ; 3 additional bytes
        AddBytes = 3
      ElseIf (Code & %11111100) = %11111000 ; 4 additional bytes
        AddBytes = 4
      ElseIf (Code & %11111110) = %11111100 ; 5 additional bytes
        AddBytes = 5
      Else
        ValidUTF8 = #False
        Break ; no utf8, because it does not fit the standard
      EndIf

      ; A sequence that would run past the end of the buffer is truncated and
      ; therefore not valid utf8. Checking this first also keeps the loop
      ; below from reading outside the given area.
      If ByteLength > 0 And x + AddBytes >= ByteLength
        ValidUTF8 = #False
        Break
      EndIf

      ; validate the continuation bytes
      For a = 1 To AddBytes
        x = x + 1
        Code = PeekA(MemPointer + x)
        If (Code & %11000000) <> %10000000
          ValidUTF8 = #False
          ; no utf8, because following bytes do not match "10xxxxxx".
          ; The result is decided, so leave both loops.
          Break 2
        EndIf
      Next
    EndIf
    x = x + 1
  Until (x >= ByteLength And ByteLength > 0) Or Code = 0

  ; 7 bit text has to be tested first: it never clears ValidUTF8 and would
  ; otherwise always be reported as "utf-8".
  If IsASC = #True
    ProcedureReturn "ascii"
  EndIf
  If ValidUTF8 = #True
    ; at least one byte > 127 was found and every sequence was well-shaped
    ProcedureReturn "utf-8"
  EndIf
  ProcedureReturn "iso"
EndProcedure
; Reads a string from a raw memory area whose encoding is not known in advance.
; The encoding is detected with GetCharactersetMemory() and the matching
; PeekS() format is used, so the text is returned correctly no matter whether
; it was stored as single byte, UTF-8 or UTF-16LE data.
;
; Parameters:
;   Memory - address of the first byte of the text.
;   Length - size of the text in bytes. If it is 0 or negative, the text is
;            read up to its null terminator (the same convention that
;            GetCharactersetMemory() uses for its ByteLength parameter).
;
; Returns the decoded string. An empty string is returned for a null pointer,
; for a buffer that holds nothing but a byte order mark, and for UTF-16BE
; data, which is detected but not supported.
Procedure.s PeekSSmart(Memory.i, Length.i)
  Protected CharSet.s

  If Memory = 0
    ProcedureReturn "" ; nothing to read
  EndIf

  CharSet = GetCharactersetMemory(Memory, Length)

  Select CharSet
    Case "utf-16LE-bom"
      ; peek as UTF16LE ignoring the 2 byte BOM
      If Length <= 0
        ProcedureReturn PeekS(Memory + 2, -1, #PB_Unicode)
      ElseIf Length > 2
        ProcedureReturn PeekS(Memory + 2, (Length - 2) / 2, #PB_Unicode)
      EndIf
      ProcedureReturn ""

    Case "utf-16BE-bom"
      Debug "utf-16BE-bom not supportet by PeekSSmart()"
      ProcedureReturn ""

    Case "utf-8bom"
      ; peek as utf8 ignoring the 3 byte BOM
      If Length <= 0
        ProcedureReturn PeekS(Memory + 3, -1, #PB_UTF8)
      ElseIf Length > 3
        ; Length counts bytes, not characters: without #PB_ByteLength PeekS()
        ; would keep reading behind the buffer for multi-byte text.
        ProcedureReturn PeekS(Memory + 3, Length - 3, #PB_UTF8 | #PB_ByteLength)
      EndIf
      ProcedureReturn ""

    Case "utf-8"
      ; peek as utf8
      If Length <= 0
        ProcedureReturn PeekS(Memory, -1, #PB_UTF8)
      EndIf
      ProcedureReturn PeekS(Memory, Length, #PB_UTF8 | #PB_ByteLength)

    Case "iso", "ascii"
      ; peek as single byte text (one byte per character)
      If Length <= 0
        ProcedureReturn PeekS(Memory, -1, #PB_Ascii)
      EndIf
      ProcedureReturn PeekS(Memory, Length, #PB_Ascii)
  EndSelect

  ProcedureReturn ""
EndProcedure
Kukulkan