Page 1 of 1

Read text files (all formats)

Posted: Wed Mar 26, 2025 7:14 pm
by Michael Vogel
Hi, I wrote a purebasic tool to search quickly within source files (or other text files) which uses the command ReadString().
Unfortunately, some text file formats aren't supported, so I wrote the following code as a quick workaround for big-endian formatted files.

This solution needs to allocate memory twice (ReAllocateMemory and PeekS). I thought about reading the file directly into the string, but then I'd need to use something like Buffer=Space(FileSize>>#PB_Compiler_Unicode), which is slower than just allocating memory. Any other ideas that keep the string instead of working only with a memory buffer?

Maybe also the byte swapping could be improved, any ideas?

Code: Select all

; Reads the whole of FileName into the string Buffer.
; - ReadStringFormat() consumes the BOM and reports the detected format in m.
; - Formats that pass the "m%24>=#PB_UTF16BE" test are handled manually:
;   UTF-16 BE is read as raw bytes, byte-swapped in place and copied out
;   with PeekS(); anything else in that group is reported as unsupported.
; - All remaining formats are read with ReadString().
; NOTE(review): the %24 fold presumably orders #PB_Ascii/#PB_Unicode/#PB_UTF8
; below #PB_UTF16BE - confirm against the constant values.
#File=1

FileName.s="test.txt"
FileSize=FileSize(FileName)
Buffer.s

If ReadFile(#File,FileName,#PB_File_SharedRead)
	m=ReadStringFormat(#File); BOM (Byte Order Mark)
	; Debug Str(m)+": "+Files(Tool\CheckFiles)\Name
	If m%24>=#PB_UTF16BE
		If m=#PB_UTF16BE

			; The file size is an upper bound for the payload (the BOM was already consumed)
			BufferSize=FileSize
			*Buffer=ReAllocateMemory(*Buffer,BufferSize,#PB_Memory_NoClear)

			If *Buffer
				; Fix: read from the file opened above (the original passed file number 0)
				BufferSize=ReadData(#File,*Buffer,BufferSize)
				; ShowMemoryViewer(*Buffer,BufferSize)
				m=BufferSize
				; Fix: only accept an even byte count. With an odd count the loop below
				; would step m from 1 to -1, never reach 0 and run past the buffer.
				If m And m&1=0
					*BufferA=*Buffer
					*BufferB=*Buffer+1

					; Exchange the two bytes of every 16-bit unit (big endian -> little endian)
					While m
						n=PeekA(*BufferA)
						PokeA(*BufferA,PeekA(*BufferB))
						PokeA(*BufferB,n)
						m-2
						*BufferA+2
						*BufferB+2
					Wend
					; ShowMemoryViewer(*Buffer,BufferSize)
					; BufferSize bytes hold BufferSize>>1 16-bit characters
					Buffer=PeekS(*Buffer,BufferSize>>1)
				Else
					Buffer=""
				EndIf
			Else
				Buffer=""
			EndIf

		Else
			Debug "PANIC - Filetype not supported"
			Buffer=""
		EndIf
	Else
		Buffer=ReadString(#File,m|#PB_File_IgnoreEOL)
	EndIf
	CloseFile(#File)
EndIf

Re: Read text files (all formats)

Posted: Thu Mar 27, 2025 1:21 pm
by AZJIO
UTF-8 without BOM is not supported?

Re: Read text files (all formats)

Posted: Thu Mar 27, 2025 9:57 pm
by Michael Vogel
Quite sure that some formats won't work - that's why I am asking for help :lol:

The following modification allows testing different text formats; here are three data lines (ASCII, UTF16BE, UTF16LE) which work...

Code: Select all


; Sample file contents written to disk between the labels Start and Stop.
; Exactly one Data.a line is meant to be active; the others are kept as
; comments so the test format can be switched by moving the ';'.
DataSection
	Start:
	; Active: single-byte text without BOM - "The Quick Brown Fox... " followed by the bytes 228,246,252,223,128
	Data.a 84,104,101,32,81,117,105,99,107,32,66,114,111,119,110,32,70,111,120,46,46,46,32,228,246,252,223,128
	; Disabled: the same text as 16-bit units, high byte first, after the BOM bytes 254,255 (last unit is $20AC)
	;Data.a 254,255,0,84,0,104,0,101,0,32,0,81,0,117,0,105,0,99,0,107,0,32,0,66,0,114,0,111,0,119,0,110,0,32,0,70,0,111,0,120,0,46,0,46,0,46,0,32,0,228,0,246,0,252,0,223,32,172
	; Disabled: the same text as 16-bit units, low byte first, after the BOM bytes 255,254
	;Data.a 255,254,84,0,104,0,101,0,32,0,81,0,117,0,105,0,99,0,107,0,32,0,66,0,114,0,111,0,119,0,110,0,32,0,70,0,111,0,120,0,46,0,46,0,46,0,32,0,228,0,246,0,252,0,223,0,172,32
	Stop:
EndDataSection

; Test driver: writes the bytes between the labels Start and Stop to FileName,
; then reads the file back into the string Buffer and prints it.
; - ReadStringFormat() consumes the BOM and reports the detected format in m.
; - UTF-16 BE is read as raw bytes, byte-swapped in place and copied with PeekS().
; - Other formats in the "m%24>=#PB_UTF16BE" group are reported as unsupported.
; - Everything else is read with ReadString().
#File=1
FileName.s="test.txt"

; Fix: only write and close when the file could actually be created
If CreateFile(#File,FileName)
	WriteData(#File,?Start,?Stop-?Start)
	CloseFile(#File)
Else
	Debug "PANIC - Cannot create "+FileName
EndIf


FileSize=FileSize(FileName)
Buffer.s

If ReadFile(#File,FileName,#PB_File_SharedRead)
	m=ReadStringFormat(#File); BOM (Byte Order Mark)
	; Debug Str(m)+": "+Files(Tool\CheckFiles)\Name
	If m%24>=#PB_UTF16BE
		If m=#PB_UTF16BE

			; The file size is an upper bound for the payload (the BOM was already consumed)
			BufferSize=FileSize
			*Buffer=ReAllocateMemory(*Buffer,BufferSize,#PB_Memory_NoClear)

			If *Buffer
				BufferSize=ReadData(#File,*Buffer,BufferSize)
				; ShowMemoryViewer(*Buffer,BufferSize)
				m=BufferSize
				; Fix: only accept an even byte count. With an odd count the loop below
				; would step m from 1 to -1, never reach 0 and run past the buffer.
				If m And m&1=0
					*BufferA=*Buffer
					*BufferB=*Buffer+1

					; Exchange the two bytes of every 16-bit unit (big endian -> little endian)
					While m
						n=PeekA(*BufferA)
						PokeA(*BufferA,PeekA(*BufferB))
						PokeA(*BufferB,n)
						m-2
						*BufferA+2
						*BufferB+2
					Wend
					; ShowMemoryViewer(*Buffer,BufferSize)
					; BufferSize bytes hold BufferSize>>1 16-bit characters
					Buffer=PeekS(*Buffer,BufferSize>>1)
				Else
					Buffer=""
				EndIf
			Else
				Buffer=""
			EndIf

		Else
			Debug "PANIC - Filetype not supported"
			Buffer=""
		EndIf
	Else
		Buffer=ReadString(#File,m|#PB_File_IgnoreEOL)
	EndIf
	CloseFile(#File)

	Debug "'"+Buffer+"'"
EndIf

Re: Read text files (all formats)

Posted: Fri Mar 28, 2025 2:30 pm
by mk-soft
Convert in place ...

Code: Select all

; Exchanges the two bytes of a 16-bit value.
; value: the 16-bit unsigned word (.u) to convert.
; The implementation is selected at compile time by the compiler backend.
Procedure bswap16(value.u)
  CompilerIf #PB_Compiler_Backend = #PB_Backend_C
    ; C backend: pass a raw C statement through that returns the builtin's result
    !return __builtin_bswap16(v_value);
  CompilerElse
    ; Assembler backend: clear eax, load the 16-bit parameter into ax,
    ; then rotate ax by 8 bits, which exchanges its two bytes
    !xor eax,eax
    !mov ax, word [p.v_value]
    !rol ax, 8
    ; NOTE(review): no explicit value here - presumably the caller receives
    ; whatever the assembly above left in eax; confirm
    ProcedureReturn
  CompilerEndIf
EndProcedure

; Sample text converted in place by the code below, located between the
; labels Start and Stop. Exactly one Data.a line is meant to be active.
DataSection
	Start:
	; Disabled: single-byte text without BOM
	;Data.a 84,104,101,32,81,117,105,99,107,32,66,114,111,119,110,32,70,111,120,46,46,46,32,228,246,252,223,128
	; Active: BOM bytes 254,255 followed by 16-bit units stored high byte first
	Data.a 254,255,0,84,0,104,0,101,0,32,0,81,0,117,0,105,0,99,0,107,0,32,0,66,0,114,0,111,0,119,0,110,0,32,0,70,0,111,0,120,0,46,0,46,0,46,0,32,0,228,0,246,0,252,0,223,32,172
	; Disabled: BOM bytes 255,254 followed by 16-bit units stored low byte first
	;Data.a 255,254,84,0,104,0,101,0,32,0,81,0,117,0,105,0,99,0,107,0,32,0,66,0,114,0,111,0,119,0,110,0,32,0,70,0,111,0,120,0,46,0,46,0,46,0,32,0,228,0,246,0,252,0,223,0,172,32
	; Zero word after the text: acts as terminator for the length-less PeekS() call below
	Data.w 0
	Stop:
EndDataSection

; Pointer view that lets a memory block be indexed as 16-bit unsigned (.u)
; elements; ConvertBE accesses it as *Mem\u[index].
Structure ArrayOfUnicode
  ; Declared with [0] elements: the usable range is set by the caller, not by the structure
  u.u[0]
EndStructure

; Byte-swaps a block of 16-bit units in place (e.g. UTF-16 BE -> UTF-16 LE).
; *Mem: start of the memory block, viewed as an array of .u elements.
; size: length of the block in bytes; a trailing odd byte is left untouched.
; Each unit is converted with bswap16().
Procedure ConvertBE(*Mem.ArrayOfUnicode, size)
  Protected index, count
  
  ; Number of complete 16-bit units inside the block
  count = size >> 1
  
  ; Fix: valid indexes are 0 .. count-1. The original looped "0 To max" and
  ; therefore also swapped the word directly behind the block.
  For index = 0 To count - 1
    *Mem\u[index] = bswap16(*Mem\u[index])
  Next
EndProcedure

; Demo: skip the 2-byte BOM at Start, byte-swap everything up to Stop
; (the text plus the trailing zero word) in place, then print the result.
*payload = ?start + 2
payloadBytes = ?stop - ?start - 2
ConvertBE(*payload, payloadBytes)

; PeekS() without a length reads up to the zero terminator
Debug PeekS(*payload)

Re: Read text files (all formats)

Posted: Fri Mar 28, 2025 4:34 pm
by Michael Vogel
Thanks, infratec had a similar idea which seems to be even faster (see code below)...

Code: Select all

; Exchanges the two bytes of a 16-bit value.
; value: the 16-bit unsigned word (.u) to convert.
; The implementation is selected at compile time by the compiler backend.
Procedure bswap16(value.u)
	CompilerIf #PB_Compiler_Backend = #PB_Backend_C
		; C backend: pass a raw C statement through that returns the builtin's result
		!return __builtin_bswap16(v_value);
	CompilerElse
		; Assembler backend: clear eax, load the 16-bit parameter into ax,
		; then rotate ax by 8 bits, which exchanges its two bytes
		!xor eax,eax
		!mov ax, word [p.v_value]
		!rol ax, 8
		; NOTE(review): no explicit value here - presumably the caller receives
		; whatever the assembly above left in eax; confirm
		ProcedureReturn
	CompilerEndIf
EndProcedure

; Helper macros that expand to a large amount of zero-valued data inside the
; DataSection below (used there as xxx()).
; NOTE(review): presumably this only enlarges the test file so the timing of the
; two swap variants becomes measurable - confirm with the author.

; x(): a comma-separated run of zeros ending with a trailing comma, so that
; several expansions can be chained inside one Data statement.
Macro x()
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
EndMacro
; xx(): one Data.q statement made of repeated x() expansions; the final 0
; completes the list after the last trailing comma.
Macro xx()
	Data.q x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() x() 0
EndMacro
; xxx(): twenty xx() statements in a row.
Macro xxx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
	xx()
EndMacro

; Sample file contents written to disk between the labels Start and Stop:
; one active text line followed by the zero data generated by xxx().
DataSection
	Start:
	; Disabled: single-byte text without BOM
	;Data.a 84,104,101,32,81,117,105,99,107,32,66,114,111,119,110,32,70,111,120,46,46,46,32,228,246,252,223,128
	; Active: BOM bytes 254,255 followed by 16-bit units stored high byte first
	Data.a 254,255,0,84,0,104,0,101,0,32,0,81,0,117,0,105,0,99,0,107,0,32,0,66,0,114,0,111,0,119,0,110,0,32,0,70,0,111,0,120,0,46,0,46,0,46,0,32,0,228,0,246,0,252,0,223,32,172
	; Disabled: BOM bytes 255,254 followed by 16-bit units stored low byte first
	;Data.a 255,254,84,0,104,0,101,0,32,0,81,0,117,0,105,0,99,0,107,0,32,0,66,0,114,0,111,0,119,0,110,0,32,0,70,0,111,0,120,0,46,0,46,0,46,0,32,0,228,0,246,0,252,0,223,0,172,32
	; Zero-valued Data.q statements appended after the text
	xxx()
	Stop:
EndDataSection

; Writes the bytes between the labels Start and Stop to FileName so the
; reader code below has a file to work on.
#File=1
FileName.s="test.txt"

; Fix: only write and close when the file could actually be created
If CreateFile(#File,FileName)
	WriteData(#File,?Start,?Stop-?Start)
	CloseFile(#File)
Else
	Debug "PANIC - Cannot create "+FileName
EndIf


; Pointer view of a raw memory buffer. The union overlays three arrays on the
; same bytes so the buffer can be indexed with different element types.
Structure BufferType
	StructureUnion
		; Byte-wise access; used by the Swap loop as *Buffer\a[m]
		a.a[0]
		; Character-wise access; not used by the code visible here
		c.c[0]
		; 16-bit access; used by the bswap16 variant as *Buffer\u[index]
		u.u[0]
	EndStructureUnion
EndStructure

; Reads FileName into the string Buffer and prints it.
; - ReadStringFormat() consumes the BOM and reports the detected format in m.
; - UTF-16 BE is read as raw bytes and byte-swapped in place. Two variants
;   are kept for comparison, selected by "CompilerIf 1"; the time the active
;   variant took (including PeekS) is accumulated in t and printed.
; - Other formats in the "m%24>=#PB_UTF16BE" group are reported as unsupported.
; - Everything else is read with ReadString().
FileSize=FileSize(FileName)
*Buffer.BufferType
Buffer.s

If ReadFile(#File,FileName,#PB_File_SharedRead)
	m=ReadStringFormat(#File); BOM (Byte Order Mark)
	; Debug Str(m)+": "+Files(Tool\CheckFiles)\Name
	If m%24>=#PB_UTF16BE
		If m=#PB_UTF16BE
			Debug "Go..."

			; The file size is an upper bound for the payload (the BOM was already consumed)
			BufferSize=FileSize
			*Buffer=ReAllocateMemory(*Buffer,BufferSize,#PB_Memory_NoClear)

			If *Buffer
				BufferSize=ReadData(#File,*Buffer,BufferSize)
				; ShowMemoryViewer(*Buffer,BufferSize)
				m=BufferSize

				; Only non-empty data with an even byte count can be swapped pairwise
				If m And m&1=0
					; Subtracting the start time now and adding the end time later leaves the elapsed time in t
					t-ElapsedMilliseconds()

					CompilerIf 1
						; Variant 1: walk backwards through the buffer, exchanging each byte pair
						While m
							m-2
							Swap *Buffer\a[m], *Buffer\a[m+1]
						Wend
						Buffer=PeekS(*Buffer,BufferSize>>1)

					CompilerElse
						; Variant 2: convert one 16-bit unit at a time with bswap16()
						m = BufferSize>>1
						; ShowMemoryViewer(*Buffer,BufferSize)
						; Fix: valid indexes are 0 .. m-1. The original looped "0 To m" and
						; wrote one word past the end of the allocated buffer.
						For index = 0 To m-1
							*Buffer\u[index]=bswap16(*Buffer\u[index])
							; ShowMemoryViewer(*Buffer,20)
						Next
						Buffer=PeekS(*Buffer,BufferSize>>1)

					CompilerEndIf
			
					t+ElapsedMilliseconds()
		
				Else
					Buffer=""
				EndIf

			Else
				Buffer=""
			EndIf

			Debug t

		Else
			Debug "PANIC - Filetype not supported"
			Buffer=""
		EndIf
	Else
		Buffer=ReadString(#File,m|#PB_File_IgnoreEOL)
	EndIf
	CloseFile(#File)


	Debug "'"+Buffer+"'"
EndIf
PS: The tool using this code can be downloaded here; it searches for keywords located in different lines of your source code files. If you want to use this program from inside the PureBasic editor, you should set the arguments within the tool configuration to "%home\..\Source" %word .