Fix case sensitivity in path for html files

Just starting out? Need help? Post your questions and find answers here.
AZJIO
Addict
Addict
Posts: 2223
Joined: Sun May 14, 2017 1:48 am

Fix case sensitivity in path for html files

Post by AZJIO »

I was trying to fix case-sensitive paths in file and folder names. If we extract the CHM file, we will see the paths in different registers. My task is to make the letter case in the text the same as the real files. For example

Code: Select all

<a href="../2ddrawing/index.html">Drawing operations</a>
The real path: 2DDrawing\AlphaBlend.html
Using the ScanDir() function, I get the tree structure. Next, I need to send the path to the array, dividing the string by "/". Each element of the path has an attachment level. I'm trying to find a folder with the same case-insensitive attachment level, and if it is found and does not match the case, then I replace it with the correct name. I also need to take into account the relative paths "../".
I'm a little confused right now, but it's been too long since I've tried to solve the problem I need for the chmViewer program. But I think this is a more universal feature, as it will help to fix any help so that it can be uploaded to a server that also uses case-sensitive file names.

Code: Select all

EnableExplicit

#q$ = Chr(34)
; #q$ = "`"

;- ● Enumeration
Enumeration File
	#File
EndEnumeration


Enumeration RegExp
	#RegExpFixPathTOC
	#RegExpFixPathHTML
EndEnumeration

Global dir0$ = "C:\html\"
Global pathTOS$ = "Table of Contents"

Structure Path2
	path.s
	pathLCase.s
	file.s
	fileLCase.s
EndStructure

Structure Files
    Name.s ; File name
    NameL.s ; File name (lowercase)
EndStructure

Structure Tree
    DirName.s ; Folder name
    DirNameL.s ; Folder name (lowercase)
    List Files.Files() ; List of files in the current folder.
    List Dirs.Tree() ; List of subfolders of the current folder.
EndStructure


Procedure ScanDir(*s.Tree, AllDir.s) ; Save the structure of the specified folder
    Protected id, Dir.s, ext.s

    If *s = 0
        ProcedureReturn 0
    EndIf

    id = ExamineDirectory(#PB_Any, AllDir, "*.*")
    If id
        While NextDirectoryEntry(id)
            If DirectoryEntryType(id) = #PB_DirectoryEntry_Directory ; Folder
                Dir = DirectoryEntryName(id)
                If Dir <> "." And Dir <> ".."
                    If AddElement(*s\Dirs())
                        *s\Dirs()\DirName = Dir
                        *s\Dirs()\DirNameL = LCase(Dir)
                        ScanDir(*s\Dirs(), AllDir + Dir + #PS$) ; Recursive procedure call
                    EndIf
                EndIf
            Else ; File
            	ext = GetExtensionPart(DirectoryEntryName(id))
            	If (ext = "html" Or ext = "htm") And AddElement(*s\Files()) ; checking that the file type/extension is html
                    *s\Files()\Name = DirectoryEntryName(id)
                    *s\Files()\NameL = LCase(DirectoryEntryName(id))
            	EndIf
            EndIf
        Wend
        FinishDirectory(id)
    EndIf

    ProcedureReturn
EndProcedure


Procedure ClearFileTree(*s.Tree) ; Clear the structure previously obtained by the ScanDir function

    If *s = 0
        ProcedureReturn 0
    EndIf
    
    FreeList(*s\Files()) ; delete file lists
    If ListSize(*s\Dirs())
    	ForEach *s\Dirs()
    		ClearFileTree(*s\Dirs()) ; Recursive procedure call
    	Next
    	FreeList(*s\Dirs()) ; delete folder lists after deleting its contents
    EndIf

    ProcedureReturn
EndProcedure


Procedure SplitA2(String$, Array StringList.s(1), Separator$ = #CRLF$ + #TAB$ + #FF$ + #VT$ + " ")
	Protected *S = @String$
	Protected *jc.Character, *c.Character = @String$
	Protected i
	
	i = 0
	While *c\c
		*jc = @Separator$
		
		While *jc\c
			If *c\c = *jc\c
				*c\c = 0
				If *S <> *c
					ReDim StringList(i)
					StringList(i) = PeekS(*S)
					i + 1
				EndIf
				*S = *c + SizeOf(Character)
				Break
			EndIf
			*jc + SizeOf(Character)
		Wend
		
		*c + SizeOf(Character)
	Wend
	ReDim StringList(i)
	StringList(i) = PeekS(*S)
EndProcedure


Procedure TestPath(*s.Tree, *folder.String, IsFile = 0)
	If IsFile
		ForEach *s\Files()
			If *s\Files()\NameL = LCase(*folder\s) ; if they are equal in lowercase, then
				If *s\Files()\Name <> *folder\s	   ; if they are NOT equal in uppercase, then
					*folder\s = *s\Files()\Name
					ProcedureReturn
					Break
				EndIf
			EndIf
		Next
	Else
		ForEach *s\Dirs()
			If *s\Dirs()\DirNameL = LCase(*folder\s) ; if they are equal in lowercase, then
				If *s\Dirs()\DirName <> *folder\s	   ; if they are NOT equal in uppercase, then
					*folder\s = *s\Dirs()\DirName
					ProcedureReturn @*s\Dirs() ; return the pointer to the list element
				   ; ProcedureReturn *s\Dirs()\DirName ; return the pointer to the first element of the structure, that is, to the structure itself
				   ; ReplaceString(Text$, RegularExpressionMatchString(#RegExpFixPathTOC), Files2()\path, #PB_String_InPlace, RegularExpressionMatchPosition(#RegExpFixPathTOC), 1)
				   ; Debug "replacement in position: " + RegularExpressionMatchPosition(#RegExpFixPathTOC)
					Break
				EndIf
			EndIf
		Next
	EndIf
	ProcedureReturn 0 ; return 0 if nothing is found.
EndProcedure

Procedure.s ReadFileToVar(Path$)
	Protected id_file, Format, Text$

	id_file = ReadFile(#PB_Any, Path$)
	If id_file
		Format = ReadStringFormat(id_file)
		Text$ = ReadString(id_file, Format | #PB_File_IgnoreEOL)
		; 	Text$ = ReadString(id_file, #PB_UTF8 | #PB_File_IgnoreEOL)
		CloseFile(id_file)
	EndIf

	ProcedureReturn Text$
EndProcedure


Procedure FixPath(AndHTML = 0)
	Protected NewList Files.s()
	Protected NewList Files2.Path2()
	Protected length, Format, pathTosTotal$, Text$, pathLCase$, htmpath$, *s, i, *tmp, newPath$
	Protected Dim pathLCase.s(0)
	
	length = Len(dir0$) + 1
	; for Linux, it is necessary to generate html paths in lowercase to find if the path is incorrect.
	
	Protected s.Tree ; Creating an instance of a structure.
	
; If the path to the folder has text and exists, being the correct path, then
	If Asc(dir0$) And FileSize(dir0$) = -2
		s\DirName = dir0$
; scan the folder
		ScanDir(s, s\DirName) ; scan getting a tree of files and folders
		ClearFileTree(s)
	EndIf
	
; Fix paths in hhc
	pathTosTotal$ = dir0$ + pathTOS$ + ".hhc"
; 	Text$ = OpenFileToGadget(pathTosTotal$, @Format) ; read hhc
	Text$ = ReadFileToVar(pathTosTotal$) ; read hhc
	If Format <> #PB_Ascii
		Format = #PB_UTF8 ; do not give permission to other formats so as not to fail.
	EndIf
	
; we get all the paths in the hhc text
	If CreateRegularExpression(#RegExpFixPathTOC, "<param name=.Local. value=.\K[^\r\n:*?<>|]+]*?(?=.>)", #PB_RegularExpression_NoCase)
		If ExamineRegularExpression(#RegExpFixPathTOC, Text$)
			While NextRegularExpressionMatch(#RegExpFixPathTOC)
				If FileSize(dir0$ + RegularExpressionMatchString(#RegExpFixPathTOC)) < 0 ; if the found path does not exist (in Linux due to the case), then
; Debug "not found: " +RegularExpressionMatchString(#RegExpFixPathTOC)
					pathLCase$ = LCase(RegularExpressionMatchString(#RegExpFixPathTOC)) ; half a copy of the path
					If Asc(pathLCase$) = '\' Or Asc(pathLCase$) = '/'
						pathLCase$ = Mid(pathLCase$, 2) ; read from the second character
					EndIf
; check here that it does not start with "./"
					; start moving through the tree
					ReDim pathLCase(0) ; clear the array
					SplitA2(pathLCase$, pathLCase(), "\/") ; dividing the array into path elements
					*s = @s ; starting the immersion from the root
					For i = 0 To ArraySize(pathLCase())
						*tmp = @pathLCase(i)
						If i = ArraySize(pathLCase())
							TestPath(*s, @*tmp, 1) ; the last one is a file
						Else
							*s = TestPath(*s, @*tmp) ; we return the attached folder
						EndIf
						If *s = 0
							Break
						EndIf
						newPath$ + pathLCase(i) + #PS$
					Next
					newPath$ = RTrim(newPath$, #PS$)
; if the case-insensitive path contains the same path, then
					If FindString(newPath$, pathLCase$, #PB_String_NoCase, #PB_String_NoCase)
						Debug "путь не поломался"
					EndIf
					
					ForEach Files2()
						If Files2()\pathLCase = pathLCase$
							ReplaceString(Text$, RegularExpressionMatchString(#RegExpFixPathTOC), Files2()\path, #PB_String_InPlace, RegularExpressionMatchPosition(#RegExpFixPathTOC), 1)
							Debug "замена в позиции: " + RegularExpressionMatchPosition(#RegExpFixPathTOC)
							Break
						EndIf
					Next
				EndIf
			Wend
			If CreateFile(#File, dir0$ + "0.hhc", #PB_UTF8)
				WriteString(#File, Text$)
				CloseFile(#File)
			EndIf
		EndIf
		FreeRegularExpression(#RegExpFixPathTOC)
	EndIf
	If AndHTML
		Debug "——————————————————————"
		If CreateRegularExpression(#RegExpFixPathHTML, "(?<=\hhref=([" + #q$ + "']))[^\r\n<>|?*:]+?(?=\1)", #PB_RegularExpression_NoCase)
			ForEach Files()
					Files2()\file = GetPathPart(Files2()\path)
					Files2()\fileLCase = GetPathPart(Files2()\pathLCase)
			Next
			ForEach Files()
; 				Text$ = OpenFileToGadget(Files(), @Format)
				Text$ = ReadFileToVar(Files())
				If Format <> #PB_Ascii
					Format = #PB_UTF8 ; do not give permission to other formats so as not to fail.
				EndIf
				htmpath$ = GetPathPart(Files())
				If ExamineRegularExpression(#RegExpFixPathHTML, Text$)
					While NextRegularExpressionMatch(#RegExpFixPathHTML)
						; ignore paths starting with a dot
						If Asc(RegularExpressionMatchString(#RegExpFixPathHTML)) <> '.' And FileSize(htmpath$ + RegularExpressionMatchString(#RegExpFixPathHTML)) < 0 ; if the found path does not exist, then
							Debug "не найден: " + RegularExpressionMatchString(#RegExpFixPathHTML)
							pathLCase$ = LCase(RegularExpressionMatchString(#RegExpFixPathHTML))
							ForEach Files2()
								If Files2()\fileLCase = pathLCase$
									ReplaceString(Text$, RegularExpressionMatchString(#RegExpFixPathHTML), Files2()\file, #PB_String_InPlace, RegularExpressionMatchPosition(#RegExpFixPathHTML), 1)
									Debug "replacement in position: " + RegularExpressionMatchPosition(#RegExpFixPathHTML)
									Break
								EndIf
							Next
						EndIf
					Wend
				EndIf
				If CreateFile(#File, Files(), Format)
					WriteString(#File, Text$)
					CloseFile(#File)
				EndIf
			Next
			
			FreeRegularExpression(#RegExpFixPathHTML)
		Else
			Debug "Error"
		EndIf
		
		MessageRequester("", "")
	EndIf
; 		If Asc(filepaths1$) And Asc(filepaths2$)
; 			If CreateFile(#File, dir0$ + "f1.txt", #PB_UTF8)
; 				WriteString(#File , filepaths1$)
; 				CloseFile(#File)
; 			EndIf
; 			If CreateFile(#File, dir0$ + "f2.txt", #PB_UTF8)
; 				WriteString(#File , filepaths2$)
; 				CloseFile(#File)
; 			EndIf
; 		EndIf
EndProcedure
SMaag
Enthusiast
Enthusiast
Posts: 327
Joined: Sat Jan 14, 2023 6:55 pm
Location: Bavaria/Germany

Re: Fix case sensitivity in path for html files

Post by SMaag »

First I have to explain what I understand from your post.

1. You have a list with LoCase PathNames
2. Because on Linux a Path is CaseSensitive, it is possible it don't match with the LoCase version in the list!
3. Now you want to search for Pathes which match in LCase Characters to your list!
4.You want to change the Path in your List with thre real CaseSensitive version you found on disk!

Is this correct?

If this is correct, my approach would be a Hashtable of the LCase PathNames or PathParts for searching identical Names!
First I would try PB Map!
AZJIO
Addict
Addict
Posts: 2223
Joined: Sun May 14, 2017 1:48 am

Re: Fix case sensitivity in path for html files

Post by AZJIO »

That's right.
Deepseek proposed renaming all files and folders to the lower register, as well as all ways on the pages of HTML in the lower register. But this path is too simple, in addition, I like that the files are called in the right register and I would not want to make them all in the lower register. I myself knew this without Deepseek.

Suppose you want to get 3000 files and then compare 30,000 paths on this list. Each path of a long 20 characters. I propose to make a tree with 30 folders and 100 files inside, then first we will find a folder in 30 folders, and then a file of 100 files, as a result, we will make 130 comparisons of 10 characters. It will be 1000 times faster.
SMaag
Enthusiast
Enthusiast
Posts: 327
Joined: Sat Jan 14, 2023 6:55 pm
Location: Bavaria/Germany

Re: Fix case sensitivity in path for html files

Post by SMaag »

I created a Demo to list all Diretories in a List() and create a Map() with all LCase(DirecotoryNames)
and then compare all Directories of List with FindMapElement.
Just to see the timing! Maybe that will help you!

Here the Result for C.\Windows
--------------------------------------------------------
Time to list Directories = 10483ms
194270 Direcories found!
Time to crate HashTable = 1567ms
Time to check if all Lcase(Dir) is in Map = 1665ms
Matches found = 194270

Press any key to exit!
--------------------------------------------------------

for other Basdir with arround 30000 entries
--------------------------------------------------------
Time to list Directories = 1865ms
30648 Direcories found!
Time to crate HashTable = 34ms
Time to check if all Lcase(Dir) is in Map = 29ms
Matches found = 30648

Press any key to exit!
--------------------------------------------------------

Code: Select all


EnableExplicit
  
  Procedure.i ListDirectories(Dir$, List lstDirs.s(), Pattern$="", SearchInSubDirecotries=#False)
  ; ===========================================================================
  ; NAME : ListDirectories
  ; DESC : This is the Standard function to list the Directories
  ; DESC : This Version to list Directories is a forke of ListFiles  
  ; DESC : from PureBasic Forum
  ; VAR(Dir$) : Start Directory
  ; VAR(List lstDirs()) : List() to hold the FileNames
  ; VAR(Pattern$) : List only Files which matches with the Pattern$
  ; RET.i : Number of Files found
  ; =========================================================================== 
    Protected NewList tmp_lstDir.s(), hDir
    AddElement(tmp_lstDir())
    tmp_lstDir()=Dir$
    
    ClearList(lstDirs())
    
    While ListSize(tmp_lstDir())
      FirstElement(tmp_lstDir())
      Dir$=tmp_lstDir()
      
      hDir=ExamineDirectory(#PB_Any,Dir$, Pattern$)
      If hDir
        While NextDirectoryEntry(hDir)            
          If DirectoryEntryType(hDir)=#PB_DirectoryEntry_Directory
            Select DirectoryEntryName(hDir)
              Case ".", ".."
                ; ignore
              Default
                If SearchInSubDirecotries
                  ; add Directory to temporary Searchlist
                  AddElement(tmp_lstDir())
                  tmp_lstDir()=Dir$ + #PS$ + DirectoryEntryName(hDir)
                EndIf
                ; add Directory to the list of found directories
                AddElement(lstDirs())
                lstDirs()=Dir$ + #PS$ + DirectoryEntryName(hDir)
             EndSelect              
          EndIf        
        Wend
        FinishDirectory(hDir)
      EndIf
      FirstElement(tmp_lstDir())
      DeleteElement(tmp_lstDir())
    Wend
    
    ProcedureReturn ListSize(lstDirs())
  EndProcedure

  Define NewList lstDirs.s()
  Define NewMap mapHash()
  
Define cntDirs
Define BaseDir$ 
Define I, res, t1, t2, t3

BaseDir$ ="D:\Daten"
;BaseDir$ ="C:\Windows"

OpenConsole("List Directories in : " + BaseDir$)

t1 = ElapsedMilliseconds()
; cntDirs = ListFiles(BaseDir$, lstDirs(), "*.*",#True)
cntDirs = ListDirectories(BaseDir$, lstDirs(), "", #True)
t1 = ElapsedMilliseconds() - t1

PrintN("Time to list Directories = " + Str(t1) + "ms")
PrintN(#Null$)
PrintN(Str(cntDirs) + " Direcories found!")

t2 = ElapsedMilliseconds()
ForEach lstDirs()
  AddMapElement(mapHash(), LCase(lstDirs()))  
Next
t2 = ElapsedMilliseconds() - t2

PrintN(#Null$)
PrintN("Time to crate HashTable = " + Str(t2) + "ms")

t3 = ElapsedMilliseconds()
I = 0
ForEach lstDirs()
  res = FindMapElement(mapHash(), LCase(lstDirs()))
  If res 
    I + 1
  EndIf 
Next
t3 = ElapsedMilliseconds() - t3

PrintN(#Null$)
PrintN("Time to check if all Lcase(Dir) is in Map = " + Str(t3) + "ms")
PrintN("Matches found = " + Str(I))

PrintN(#Null$)
PrintN("Press any key to exit!")
Input()  
  
Post Reply