Page 1 of 1

ReadChar(fH, Format) and ReadUTF8Character()

Posted: Thu Aug 14, 2014 7:38 pm
by Tenaja
This uses Wilberts UTF8Size() routine.

Code: Select all

    Procedure.i UTF8Size(FirstByte.a)
      !movzx eax, byte [p.v_FirstByte]
      !shr al, 3
      CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
        !lea rdx, [utf8size0]
        !mov al, [rdx + rax]
      CompilerElse
        !mov al, [utf8size0 + eax]
      CompilerEndIf
      ProcedureReturn
      !utf8size0:
      !db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
      !db 0,0,0,0,0,0,0,0,2,2,2,2,3,3,4,0
    EndProcedure

Procedure.i ReadUTF8Character(fPtr.i)
	Protected.i utf8Character
	Protected.i ByteCounter
	If Eof(fPtr)						; Make sure we have something to read
		ProcedureReturn 0
	EndIf
	
	; Read first byte
	utf8Character = ReadByte(fPtr)
	If utf8Character = 0				; an error
		ProcedureReturn 0
	EndIf
	
	ByteCounter = UTF8Size(utf8Character)
	
	; Read the rest of the character
	While ByteCounter > 1
		If Eof(fPtr)					; verify another byte exists.
			ProcedureReturn 0
		EndIf
		ByteCounter - 1
		utf8Character << 8
		utf8Character + ReadByte(fPtr)
	Wend
	
	ProcedureReturn utf8Character
EndProcedure
...and allows a ReadChar() with format-dependant length:

Code: Select all

Procedure ReadChar(fh.i, StrFrmt.i)
	Select StrFrmt
		Case #PB_Ascii, 0
			ProcedureReturn ReadAsciiCharacter(fh)
		Case #PB_Unicode
			ProcedureReturn ReadUnicodeCharacter(fh)
		Case #PB_UTF8
			ProcedureReturn ReadUTF8Character(fh)
	EndSelect
	ProcedureReturn 0
EndProcedure

Re: ReadChar(fH, Format) and ReadUTF8Character()

Posted: Fri Aug 15, 2014 8:44 am
by wilbert
Unfortunately reading an utf8 character it isn't as simple as shifting a byte 8 bits to the left and appending the next one.

Re: ReadChar(fH, Format) and ReadUTF8Character()

Posted: Fri Aug 15, 2014 9:10 am
by infratec
Hi,

my small contribution:

Code: Select all

CompilerIf #PB_Compiler_IsMainFile
  EnableExplicit
CompilerEndIf




Procedure.i UTF8Size(FirstByte.a)
  !movzx eax, byte [p.v_FirstByte]
  !shr al, 3
  CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
    !lea rdx, [utf8size0]
    !mov al, [rdx + rax]
  CompilerElse
    !mov al, [utf8size0 + eax]
  CompilerEndIf
  
  ProcedureReturn
  
  !utf8size0:
  !db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
  !db 0,0,0,0,0,0,0,0,2,2,2,2,3,3,4,0
  
EndProcedure




Procedure.i ReadUTF8Character(fPtr.i)
      
   Protected utf8Character.a, utf8Buffer$, ByteCounter.i, ptr.i
   
   
   If Eof(fPtr)                  ; Make sure we have something to read
     ProcedureReturn 0
   EndIf
   
   utf8Buffer$ = Space(4)
   ; Read first byte
   utf8Character = ReadByte(fPtr)
   If utf8Character = 0            ; an error
      ProcedureReturn 0
   EndIf
   
   PokeA(@utf8Buffer$ + ptr, utf8Character)
   ptr + 1
   
   ByteCounter = UTF8Size(utf8Character)
   
   ; Read the rest of the character
   While ByteCounter > 1
      If Eof(fPtr)               ; verify another byte exists.
         ProcedureReturn 0
      EndIf
      ByteCounter - 1
      PokeA(@utf8Buffer$ + ptr, ReadByte(fPtr))
      ptr + 1
   Wend
   
   ProcedureReturn Asc(PeekS(@utf8Buffer$, -1, #PB_UTF8))
EndProcedure




Procedure.i ReadChar(fh.i, StrFrmt.i)
  
  Select StrFrmt
    Case #PB_Ascii, 0
      ProcedureReturn ReadAsciiCharacter(fh)
    Case #PB_Unicode
      ProcedureReturn ReadUnicodeCharacter(fh)
    Case #PB_UTF8
      ProcedureReturn ReadUTF8Character(fh)
  EndSelect
  
  ProcedureReturn 0
  
EndProcedure






CompilerIf #PB_Compiler_IsMainFile
  
  Define.i File, FileFormat
  Define Filename$
  
  
  Filename$ = OpenFileRequester("Choose a text file ...", "", "All|*.*", 0)
  If Filename$
    File = ReadFile(#PB_Any, Filename$)
    If File
      FileFormat = ReadStringFormat(File)
      While Not Eof(File)
        Debug Chr(ReadChar(File, FileFormat))
      Wend
      CloseFile(File)
    EndIf
  EndIf
  
CompilerEndIf
Bernd

Re: ReadChar(fH, Format) and ReadUTF8Character()

Posted: Fri Aug 15, 2014 10:17 am
by wilbert
Here's my attempt.
When reading/peeking every byte is validated. This has a little speed impact but hopefully it still is fast enough.
Supported character range : 0 - $10FFFF

BytesReadFromMemory = UTF8_PeekC(*MemoryBuffer, *Character.Long)

BytesWrittenToMemory = UTF8_PokeC(*MemoryBuffer, Character.l)

Character = ReadUTF8Character(File.i); Returns -1 in case of an error

BytesWrittenToFile = WriteUTF8Character(File.i, Character.l)

Code: Select all

CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
  Macro UTF8_MovFromMem_(offset, reg)
    !mov reg, [rdx + offset]
  EndMacro
  Macro UTF8_MovToMem_(offset, reg)
    !mov [rdx + offset], reg
  EndMacro
  Macro UTF8_MovToChar_()
    !mov [rax], ecx
  EndMacro
CompilerElse
  Macro UTF8_MovFromMem_(offset, reg)
    !mov reg, [edx + offset]
  EndMacro  
  Macro UTF8_MovToMem_(offset, reg)
    !mov [edx + offset], reg
  EndMacro
  Macro UTF8_MovToChar_()
    !mov [eax], ecx
  EndMacro
CompilerEndIf

Procedure.i UTF8_PeekC(*MemoryBuffer, *Character.Long)
  CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
    !mov rdx, [p.p_MemoryBuffer]
    !mov rax, [p.p_Character]
  CompilerElse
    !mov edx, [p.p_MemoryBuffer]
    !mov eax, [p.p_Character]
  CompilerEndIf
  UTF8_MovFromMem_(0, cl)
  !cmp cl, 0xc0
  !jl utf8peekc3
  !jnb utf8peekc0
  !and ecx, 0x7f
  UTF8_MovToChar_()
  ProcedureReturn 1
  !utf8peekc0:
  !cmp cl, 0xe0
  !jnb utf8peekc1
  UTF8_MovFromMem_(1, ch)
  !cmp ch, 0xc0
  !jge utf8peekc3
  !shl ch, 2
  !rol cx, 6
  !and ecx, 0x7ff
  UTF8_MovToChar_()
  ProcedureReturn 2
  !utf8peekc1:
  !cmp cl, 0xf0
  !jnb utf8peekc2
  !shl ecx, 16
  UTF8_MovFromMem_(1, cx)
  !cmp cl, 0xc0
  !jge utf8peekc3
  !cmp ch, 0xc0
  !jge utf8peekc3  
  !xchg ch, cl
  !shl cl, 2
  !shl cx, 2
  !shr ecx, 4
  !and ecx, 0xffff
  UTF8_MovToChar_()
  ProcedureReturn 3
  !utf8peekc2:
  !cmp cl, 0xf4
  !ja utf8peekc3
  UTF8_MovFromMem_(0, ecx)
  !cmp ch, 0xc0
  !jge utf8peekc3
  !shl ch, 2
  !bswap ecx
  !cmp cl, 0xc0
  !jge utf8peekc3
  !cmp ch, 0xc0
  !jge utf8peekc3  
  !shl cl, 2
  !shr ecx, 2
  !shl cx, 4
  !shr ecx, 4
  !and ecx, 0x1ffff
  UTF8_MovToChar_()
  ProcedureReturn 4
  !utf8peekc3:
  ProcedureReturn 0
EndProcedure

Procedure.i UTF8_PokeC(*MemoryBuffer, Character.l)
  CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
    !mov rdx, [p.p_MemoryBuffer]
  CompilerElse
    !mov edx, [p.p_MemoryBuffer]
  CompilerEndIf
  !mov ecx, [p.v_Character]
  !cmp ecx, 0x7f
  !ja utf8pokec0
  UTF8_MovToMem_(0, cl)
  ProcedureReturn 1
  !utf8pokec0:
  !cmp ecx, 0x7ff
  !ja utf8pokec1
  !shl cx, 2
  !shr cl, 2
  !or cx, 1100000010000000b
  !xchg ch, cl 
  UTF8_MovToMem_(0, cx)
  ProcedureReturn 2
  !utf8pokec1:
  !cmp ecx, 0xffff
  !ja utf8pokec2
  !shl ecx, 4
  !shr cx, 2
  !shr cl, 2
  !or ecx, 111000001000000010000000b
  !bswap ecx
  UTF8_MovToMem_(0, ch)
  !shr ecx, 16
  UTF8_MovToMem_(1, cx)
  ProcedureReturn 3
  !utf8pokec2:
  !cmp ecx, 0x10ffff
  !ja utf8pokec3
  !ror ecx, 10
  !shl ch, 2
  !rol ecx, 14
  !shr cx, 2
  !shr cl, 2
  !or ecx, 11110000100000001000000010000000b
  !bswap ecx
  UTF8_MovToMem_(0, ecx)
  ProcedureReturn 4
  !utf8pokec3:
  ProcedureReturn 0
EndProcedure

Procedure.i ReadUTF8Character(File.i)
  Protected.i c, size, l = $80808080
  ReadData(File, @l, 1)
  size = UTF8_PeekC(@l, @c)
  If size = 1
    ProcedureReturn c
  ElseIf size > 1
    ReadData(File, @l + 1, size - 1)
    If UTF8_PeekC(@l, @c)
      ProcedureReturn c
    EndIf
  EndIf
  ProcedureReturn -1
EndProcedure

Procedure.i WriteUTF8Character(File.i, Character.l)
  ProcedureReturn WriteData(File, @Character, UTF8_PokeC(@Character, Character))
EndProcedure
Example of dumping the character values of an utf-8 string

Code: Select all

*Mem = AllocateMemory(1024)
PokeS(*Mem, "€1,- €2,- €3,50", -1, #PB_UTF8)
Repeat
  cnt.l = UTF8_PeekC(*Mem, @c.l)
  Debug Hex(c) + " " + Chr(c)
  *Mem + cnt
Until cnt = 0 Or c = 0

Edit: I updated the ReadUTF8Character(File.i) procedure with a simplified version

Re: ReadChar(fH, Format) and ReadUTF8Character()

Posted: Fri Aug 15, 2014 2:59 pm
by Tenaja
wilbert wrote:Unfortunately reading an utf8 character it isn't as simple as shifting a byte 8 bits to the left and appending the next one.
I was looking at this C sample and rushed through it and got it backwards.
http://zaemis.blogspot.com/2011/06/read ... -in-c.html

This one reverses the order of the original, using ShiftCounter:

Code: Select all

Procedure.i ReadUTF8Character(fPtr.i)
	Protected.i utf8Character
	Protected.i ByteCounter, ShiftCounter 
	
	If Eof(fPtr)
		ProcedureReturn 0
	EndIf
	
	; Read first byte
	utf8Character = ReadByte(fPtr)
	If utf8Character = 0			; an error
		ProcedureReturn 0
	EndIf
	
	; check how many more bytes need to be read for character
	ByteCounter = UTF8Size(utf8Character)
	
	; Read subsequent character bytes
	ShiftCounter = 1
	While ByteCounter > 1
		If Eof(fPtr)				; verify the required byte exists.
			ProcedureReturn 0
		EndIf
		
		utf8Character + (ReadByte(fPtr) << (8 * ShiftCounter))
		ByteCounter - 1
		ShiftCounter + 1
	Wend
	
	; Return value read
	ProcedureReturn utf8Character
EndProcedure
It is more tricky without any actual non-ascii characters to test on...

Re: ReadChar(fH, Format) and ReadUTF8Character()

Posted: Fri Aug 15, 2014 3:14 pm
by wilbert
Tenaja wrote:I was looking at this C sample and rushed through it and got it backwards.
http://zaemis.blogspot.com/2011/06/read ... -in-c.html

This one reverses the order of the original, using ShiftCounter:
Did you test it on an actual utf-8 file ?
Here's an example of a test file I encountered
http://www.cl.cam.ac.uk/~mgk25/ucs/exam ... 8-demo.txt

Re: ReadChar(fH, Format) and ReadUTF8Character()

Posted: Fri Aug 15, 2014 3:40 pm
by Tenaja
wilbert wrote:Here's my attempt.
I like your use of macros. I made a feeble attempt at using a macro to replace eax/rax (etc.), but it did not work, either in pb or fasm. I was trying to just replace the register name, though. Since it was just for fun to retrofit sample code, I did not put much focus on it. Replacing the whole line is a neat idea.

This seems to work with a quick test outputting to a text file. You could duplicate Xdx for Xax, etc, and eliminate all of the conditional compilation from within the procs.

Code: Select all

CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
	Macro Xdx
		rdx
	EndMacro
CompilerElse
	Macro Xdx
		edx
	EndMacro
CompilerEndIf

CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
	Macro MoveToXdx(Xdx, Source)
		!mov Xdx, Source
	EndMacro
CompilerElse
	Macro MoveToXdx(Xdx, Source)
		!mov Xdx, Source
	EndMacro
CompilerEndIf

; Use:
MoveToXdx (Xdx, [p.p_MemoryBuffer])
You can carry it a step further, and just do the whole 3-word instruction, so you only need one macro for all instructions, plus one for each e/r register pair:

Code: Select all

CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
	Macro Xdx
		rdx
	EndMacro
	Macro Xax
		rax
	EndMacro
CompilerElse
	Macro Xdx
		edx
	EndMacro
	Macro Xax
		eax
	EndMacro
CompilerEndIf

Macro asm(instruction, arg1, arg2)
	!instruction arg1, arg2
EndMacro

asm(MOV, xdx, [p.p_MemoryBuffer])
asm(MOV, xax, [p.p_Character])
Your code using these new macros:

Code: Select all

CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
	Macro UTF8_MovFromMem_(offset, reg)
		!mov reg, [rdx + offset]
	EndMacro
	Macro UTF8_MovToMem_(offset, reg)
		!mov [rdx + offset], reg
	EndMacro
	Macro UTF8_MovToChar_()
		!mov [rax], ecx
	EndMacro
CompilerElse
	Macro UTF8_MovFromMem_(offset, reg)
		!mov reg, [edx + offset]
	EndMacro 
	Macro UTF8_MovToMem_(offset, reg)
		!mov [edx + offset], reg
	EndMacro
	Macro UTF8_MovToChar_()
		!mov [eax], ecx
	EndMacro
CompilerEndIf



CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
	Macro Xdx
		rdx
	EndMacro
	Macro Xax
		rax
	EndMacro
CompilerElse
	Macro Xdx
		edx
	EndMacro
	Macro Xax
		eax
	EndMacro
CompilerEndIf

Macro asm(instruction, arg1, arg2)
	!instruction arg1, arg2
EndMacro



Procedure.i UTF8_Size(FirstByte.a)
	!movzx eax, byte [p.v_FirstByte]
	!shr al, 3
	CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
		!lea r8, [utf8size0]
		!mov al, [r8 + rax]
	CompilerElse
		!mov al, [utf8size0 + eax]
	CompilerEndIf
	ProcedureReturn
	!utf8size0:
	!db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
	!db 0,0,0,0,0,0,0,0,2,2,2,2,3,3,4,0
EndProcedure



Procedure.i UTF8_PeekC(*MemoryBuffer, *Character.Long)
	asm(MOV, xdx, [p.p_MemoryBuffer])
	asm(MOV, xax, [p.p_Character])
	UTF8_MovFromMem_(0, cl)
	!cmp cl, 0xc0
	!jl utf8peekc3
	!jnb utf8peekc0
	!and ecx, 0x7f
	UTF8_MovToChar_()
	ProcedureReturn 1
	!utf8peekc0:
	!cmp cl, 0xe0
	!jnb utf8peekc1
	UTF8_MovFromMem_(1, ch)
	!cmp ch, 0xc0
	!jge utf8peekc3
	!shl ch, 2
	!rol cx, 6
	!and ecx, 0x7ff
	UTF8_MovToChar_()
	ProcedureReturn 2
	!utf8peekc1:
	!cmp cl, 0xf0
	!jnb utf8peekc2
	!shl ecx, 16
	UTF8_MovFromMem_(1, cx)
	!cmp cl, 0xc0
	!jge utf8peekc3
	!cmp ch, 0xc0
	!jge utf8peekc3 
	!xchg ch, cl
	!shl cl, 2
	!shl cx, 2
	!shr ecx, 4
	!and ecx, 0xffff
	UTF8_MovToChar_()
	ProcedureReturn 3
	!utf8peekc2:
	!cmp cl, 0xf4
	!ja utf8peekc3
	UTF8_MovFromMem_(0, ecx)
	!cmp ch, 0xc0
	!jge utf8peekc3
	!shl ch, 2
	!bswap ecx
	!cmp cl, 0xc0
	!jge utf8peekc3
	!cmp ch, 0xc0
	!jge utf8peekc3 
	!shl cl, 2
	!shr ecx, 2
	!shl cx, 4
	!shr ecx, 4
	!and ecx, 0x1ffff
	UTF8_MovToChar_()
	ProcedureReturn 4
	!utf8peekc3:
	ProcedureReturn 0
EndProcedure



Procedure.i UTF8_PokeC(*MemoryBuffer, Character.l)
	asm(MOV, xdx, [p.p_MemoryBuffer])
	!mov ecx, [p.v_Character]
	!cmp ecx, 0x7f
	!ja utf8pokec0
	UTF8_MovToMem_(0, cl)
	ProcedureReturn 1
	!utf8pokec0:
	!cmp ecx, 0x7ff
	!ja utf8pokec1
	!shl cx, 2
	!shr cl, 2
	!or cx, 1100000010000000b
	!xchg ch, cl
	UTF8_MovToMem_(0, cx)
	ProcedureReturn 2
	!utf8pokec1:
	!cmp ecx, 0xffff
	!ja utf8pokec2
	!shl ecx, 4
	!shr cx, 2
	!shr cl, 2
	!or ecx, 111000001000000010000000b
	!bswap ecx
	UTF8_MovToMem_(0, ch)
	!shr ecx, 16
	UTF8_MovToMem_(1, cx)
	ProcedureReturn 3
	!utf8pokec2:
	!cmp ecx, 0x10ffff
	!ja utf8pokec3
	!ror ecx, 10
	!shl ch, 2
	!rol ecx, 14
	!shr cx, 2
	!shr cl, 2
	!or ecx, 11110000100000001000000010000000b
	!bswap ecx
	UTF8_MovToMem_(0, ecx)
	ProcedureReturn 4
	!utf8pokec3:
	ProcedureReturn 0
EndProcedure



Procedure.i ReadUTF8Character(File.i)
	Protected.i c, size, l = $80808080
	ReadData(File, @l, 1)
	size = UTF8_PeekC(@l, @c)
	If size = 1
		ProcedureReturn c
	ElseIf size > 1
		ReadData(File, @l + 1, size - 1)
		If UTF8_PeekC(@l, @c)
			ProcedureReturn c
		EndIf
	EndIf
	ProcedureReturn -1
EndProcedure



Procedure.i WriteUTF8Character(File.i, Character.l)
	ProcedureReturn WriteData(File, @Character, UTF8_PokeC(@Character, Character))
EndProcedure





*Mem = AllocateMemory(1024)
PokeS(*Mem, "€1,- €2,- €3,50", -1, #PB_UTF8)
Repeat
	cnt.l = UTF8_PeekC(*Mem, @c.l)
	Debug Hex(c) + " " + Chr(c)
	*Mem + cnt
Until cnt = 0 Or c = 0
The one place it still requires the conditional compilation is when there are two x64 instructions, and only one x86. (I am not familiar enough with x86 & x64 asm to know why it is not 1:1.)

Re: ReadChar(fH, Format) and ReadUTF8Character()

Posted: Fri Aug 15, 2014 3:47 pm
by wilbert
@Tenaja, I updated my code.
Especially the ReadUTF8Character(File.i) is simplified and faster now. You might want to update it in the code you posted with your macro code.
The one you couldn't replace has to do with memory addressing. Using a label directly the way the x86 code does isn't allowed for x64.