Page 1 of 1

Compare Files in different ways

Posted: Fri Apr 28, 2023 10:40 pm
by jacdelad
Hi,
I needed a function to compare two files (are they identical or not, regarding content) and also one to find duplicates. I don't know if anyone has already posted something similar, but here is my attempt (which surely can be improved):

Code: Select all

;Register the MD5 fingerprint plugin; CompareFileList() calls FileFingerprint() with #PB_Cipher_MD5
UseMD5Fingerprint()
#FC_BufferSize = 10485760;Default read buffer size in bytes (10485760 = 10 MB), default for CompareFilesSimple()
;Bit flags for the Flags parameter of CreateFileList()
EnumerationBinary FileCompare_CreateFileList
  ;Also list the files of all subdirectories
  #FC_CFL_SubDirectories
EndEnumeration
;A plain list of file paths; used as the value type of the maps in FC_Compare
Structure FC_Files
  List Files.s()
EndStructure
;Working data of CompareFileList(): file paths bucketed by a size key and by an MD5 key
Structure FC_Compare
  Map SizeList.FC_Files()
  Map MD5List.FC_Files()
EndStructure
;One group of files reported by CompareFileList() as having identical content
Structure FC_Result
  List Files.s()
EndStructure

Procedure CreateFileList(List MyList.s(),Directory$,Flags=#False);Flags:#FC_CFL_SubDirectories -> also list subdirectories
  ;Appends the full path of every file found in Directory$ to MyList().
  ;  MyList()   - receives the paths; existing elements are kept
  ;  Directory$ - directory to scan; a trailing path separator is added if missing
  ;  Flags      - #FC_CFL_SubDirectories: also scan all subdirectories
  ;Returns the number of files that were added to MyList().
  ;Directories that cannot be examined are silently skipped.
  Protected NewList TempList.s(),exa,tempdir$,name$,lastchar$,OrigSize
  OrigSize=ListSize(MyList())
  ;Accept either separator style as "already terminated", otherwise append the
  ;platform separator (#PS$ is "\" on Windows and "/" elsewhere)
  lastchar$=Right(Directory$,1)
  If lastchar$<>"\" And lastchar$<>"/"
    Directory$+#PS$
  EndIf
  ;TempList() holds the directories that still have to be scanned (no recursion needed)
  AddElement(TempList())
  TempList()=Directory$
  While ListSize(TempList())
    FirstElement(TempList())
    tempdir$=TempList()
    exa=ExamineDirectory(#PB_Any,tempdir$,"*.*")
    If exa
      While NextDirectoryEntry(exa)
        name$=DirectoryEntryName(exa)
        Select DirectoryEntryType(exa)
          Case #PB_DirectoryEntry_Directory
            ;Skip only the "." and ".." pseudo entries; any other name is a real directory
            If (Flags&#FC_CFL_SubDirectories) And name$<>"." And name$<>".."
              AddElement(TempList())
              TempList()=tempdir$+name$+#PS$
            EndIf
          Case #PB_DirectoryEntry_File
            AddElement(MyList())
            MyList()=tempdir$+name$
        EndSelect
      Wend
      FinishDirectory(exa)
    EndIf
    ;AddElement() moved the current element, so go back to the directory just scanned and drop it
    FirstElement(TempList())
    DeleteElement(TempList())
  Wend
  ProcedureReturn ListSize(MyList())-OrigSize;Returns how many files were found
EndProcedure
Procedure CompareFilesSimple(File1$,File2$,BufferSize=#FC_BufferSize)
  ;Compares the content of two files byte by byte.
  ;  File1$, File2$ - paths of the files to compare
  ;  BufferSize     - size in bytes of each of the two read buffers; values <=0 fall back to #FC_BufferSize
  ;Returns #True if both files could be read and have identical content.
  ;Returns #False if the sizes differ, the content differs, a file cannot be opened,
  ;a buffer cannot be allocated or a read fails.
  Protected file1,file2,*Buffer1,*Buffer2,BufRead1,BufRead2,Result=#False
  If FileSize(File1$)<>FileSize(File2$)
    ProcedureReturn #False;Files are not identical (different file sizes)
  EndIf
  If BufferSize<=0
    BufferSize=#FC_BufferSize
  EndIf
  file1=ReadFile(#PB_Any,File1$,#PB_File_SharedRead)
  file2=ReadFile(#PB_Any,File2$,#PB_File_SharedRead)
  If file1 And file2
    *Buffer1=AllocateMemory(BufferSize,#PB_Memory_NoClear)
    *Buffer2=AllocateMemory(BufferSize,#PB_Memory_NoClear)
    If *Buffer1 And *Buffer2
      Result=#True;Stays #True for two empty files (the loop is never entered)
      While Not Eof(file1)
        BufRead1=ReadData(file1,*Buffer1,BufferSize)
        BufRead2=ReadData(file2,*Buffer2,BufferSize)
        ;CompareMemory() returns non-zero when the buffers are EQUAL, so 0 means "different".
        ;A read of 0 bytes before Eof is treated as an error to avoid looping forever.
        If BufRead1<=0 Or BufRead1<>BufRead2 Or CompareMemory(*Buffer1,*Buffer2,BufRead1)=0
          Result=#False
          Break
        EndIf
      Wend
    EndIf
    ;Single cleanup path: release whatever was acquired, on success and on failure
    If *Buffer1:FreeMemory(*Buffer1):EndIf
    If *Buffer2:FreeMemory(*Buffer2):EndIf
  EndIf
  If file1:CloseFile(file1):EndIf
  If file2:CloseFile(file2):EndIf
  ProcedureReturn Result
EndProcedure
Procedure CompareFileList(List MyList.s(),List ResultList.FC_Result())
  ;Finds groups of files with identical content among the paths in MyList().
  ;  MyList()     - paths of the files to examine; entries that are not readable files are ignored
  ;  ResultList() - receives one element per group of duplicates, each holding the paths
  ;                 of the files in that group; existing elements are kept
  ;Step 1 buckets the files by size, step 2 hashes (MD5) only the files that share
  ;their size with at least one other file, step 3 reports every bucket with more than one file.
  Protected Temp.FC_Compare,Size.q,SizeKey$,fp$
  
  ;Step 1: bucket by file size (quad, so files >2GB work on 32-bit builds too)
  ForEach MyList()
    Size=FileSize(MyList())
    If Size>=0;-1 = not found, -2 = directory
      SizeKey$=Str(Size)
      AddElement(Temp\SizeList(SizeKey$)\Files())
      Temp\SizeList(SizeKey$)\Files()=MyList()
    EndIf
  Next
  
  ;Step 2: only sizes shared by several files can contain duplicates
  ForEach Temp\SizeList()
    If ListSize(Temp\SizeList()\Files())>1
      SizeKey$=MapKey(Temp\SizeList())
      If SizeKey$="0"
        ;All empty files are identical, no need to hash them
        AddElement(ResultList())
        ForEach Temp\SizeList()\Files()
          AddElement(ResultList()\Files())
          ResultList()\Files()=Temp\SizeList()\Files()
        Next
      Else
        ForEach Temp\SizeList()\Files()
          fp$=FileFingerprint(Temp\SizeList()\Files(),#PB_Cipher_MD5)
          If fp$<>"";Empty fingerprint = file could not be read, skip it
            ;The size is part of the key so that only files of equal size can share a bucket
            AddElement(Temp\MD5List(SizeKey$+":"+fp$)\Files())
            Temp\MD5List(SizeKey$+":"+fp$)\Files()=Temp\SizeList()\Files()
          EndIf
        Next
      EndIf
    EndIf
  Next
  
  ;Step 3: every size+MD5 bucket with more than one file is a group of duplicates
  ForEach Temp\MD5List()
    If ListSize(Temp\MD5List()\Files())>1
      AddElement(ResultList())
      ForEach Temp\MD5List()\Files()
        AddElement(ResultList()\Files())
        ResultList()\Files()=Temp\MD5List()\Files()
      Next
    EndIf
  Next
  
EndProcedure

;Demo: ask for a directory, collect all files below it and print every group of duplicates
Define Dir$
Dir$=PathRequester("Choose Directory","")
If Dir$<>""
  NewList FileList.s()
  NewList ResultList.FC_Result()
  CreateFileList(FileList(),Dir$,#FC_CFL_SubDirectories)
  CompareFileList(FileList(),ResultList())
  
  ;Summary first, then one block per group (group numbers are the 0-based list index)
  Debug Str(ListSize(ResultList()))+" group(s) found"
  ForEach ResultList()
    Debug "Group "+Str(ListIndex(ResultList()))+":"
    ForEach ResultList()\Files()
      Debug ResultList()\Files()
    Next
  Next
EndIf
I'm planning to cut the files into blocks for the duplicate finder and stop if the blocks don't match (=not the whole file has to be hashed), but I haven't done this yet.

Re: Compare Files in different ways

Posted: Sat Apr 29, 2023 4:46 am
by AZJIO
viewtopic.php?t=79382
It is enough to compare the files by size in order to first understand that they are different. Then by MD5 and if they match, then by content. Although in most cases MD5 is enough.
For large files MD5 will be slow. As a preliminary check I read single bytes at regular intervals, about 100 reads per file: for a 2 GB file, for example, divide 2 GB by 100 to get how far to skip between byte reads. Then I compare the sampled results. This is only a preliminary assessment.

If the file is more than 50,000 bytes, then I use a pseudo hash, it's faster.
Shift=FileSize / 31

Code: Select all

Procedure.s GetPseudoHash(Path$, Shift.q)
	; Builds a cheap "pseudo hash" of a file by sampling single bytes instead of reading it all.
	;   Path$ - file to sample
	;   Shift - number of bytes to skip after each sampled byte
	; Sampling starts at offset 4; every sampled byte is appended as hexadecimal text
	; (not zero-padded), and the last byte of the file is always appended at the end.
	; Returns the concatenated hex text, or an empty string if the file cannot be opened.
	Protected digest$, total, handle
	handle = ReadFile(#PB_Any, Path$)
	If handle = 0
		ProcedureReturn digest$
	EndIf
	total = Lof(handle)
	; Skip the first 4 bytes before taking the first sample
	FileSeek(handle, 4, #PB_Relative)
	While Not Eof(handle)
		digest$ + Hex(ReadByte(handle), #PB_Byte)
		FileSeek(handle, Shift, #PB_Relative)
	Wend
	; Finish with the very last byte of the file
	FileSeek(handle, total - 1, #PB_Absolute)
	digest$ + Hex(ReadByte(handle), #PB_Byte)
	CloseFile(handle)
	ProcedureReturn digest$
EndProcedure

Re: Compare Files in different ways

Posted: Sun Apr 30, 2023 12:26 am
by jacdelad
Erm...
that's what I do. All functions first check filesize. For the direct comparison of two files, the files are partially hashed until a difference is found or not.
When comparing whole folders I create a hash for the whole file, since this is easier to compare between multiple files. But I'm already working on a version that creates partial hashes, like you suggested. Unfortunately this makes comparing files a bit more complicated and more work for me.
I'm also working on an "engine" for a backup program, that's why I need the file comparison stuff.

Re: Compare Files in different ways

Posted: Sun Apr 30, 2023 5:54 am
by AZJIO
jacdelad wrote: Sun Apr 30, 2023 12:26 am Unfortunately this makes comparing files a bit more complicated and more work for me.
You are adding a size to the "Map" while adding the path to the list. You end up with an element with the same size and multiple paths in the list. If the "Map" element contains a list with only one file, then delete the element; if there are 2 or more, then you run a pseudo-hash check for each, again delete the single ones and get a similar structure, and then do the same for MD5. These are the same type of operation, only in one case with the file size and in the other with the hash. You have 3 operations of the same type, as if it were a loop of 3 steps.

The Size.q and md5.s elements in the structure are optional, but I use it because I work with a CSV list, that is, I can compare duplicates with a list whose files are not available to me or take time to get MD5. It is also needed as information for displaying results.

Code: Select all

; One entry of the duplicate search: a size, an MD5 string and a list of Lst1 elements.
; NOTE(review): the Lst1 structure is not shown here; presumably it holds the file paths - confirm.
Structure DataFSL
	; File size as a quad (64-bit)
	Size.q
	; MD5 value stored as a string
	md5.s
	List Lst1.Lst1()
EndStructure

Re: Compare Files in different ways

Posted: Tue May 02, 2023 2:40 pm
by jacdelad
Hi AZJIO,
this is a great idea. Until now I only thought of comparing two files directly to each other, but putting it into steps and thinning out the list is a great idea! Hope I have time this week.
Thanks very much for the suggestion!

Re: Compare Files in different ways

Posted: Tue May 02, 2023 6:53 pm
by StarBootics
Hello,

I don't know if the following code will help you or not but, I will let you decide.

Code: Select all

; <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
; Project name : CompareFiles
; File Name : CompareFiles - Module.pb
; File version: 1.0.1
; Programming : OK
; Programmed by : StarBootics
; Date : May 2nd, 2023
; Last Update : May 2nd, 2023
; PureBasic code : V6.02 beta 2 LTS
; Platform : Windows, Linux, MacOS X
; <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

; <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
; Programming notes
;
; Based on Wilbert's original code.
; https://www.purebasic.fr/english/viewtopic.php?f=13&t=73840
;
; <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

DeclareModule CompareFiles
  
  ; Returns 1 (#True) if both files can be read and have identical content, otherwise 0.
  Declare.i AreTheyTheSame(FileName00.s, FileName01.s)
  
EndDeclareModule

Module CompareFiles
  
  ; Number of bytes read from each file per iteration. One allocation of twice
  ; this size is split into two halves, one half per file.
  #ChunkSize = 32768
  
  Procedure.i AreTheyTheSame(FileName00.s, FileName01.s)
    
    ; Compares two files chunk by chunk.
    ;   FileName00, FileName01 - paths of the files to compare
    ; Return 0 if the files are different and 1 if the files are the same.
    ; 0 is also returned if the buffer cannot be allocated, a file cannot be
    ; opened or a read fails. Two empty files are the same.
    
    Protected Equal.i, File1.i, File2.i, Read1.i, Read2.i, Remaining.q, *Buffer
    
    *Buffer = AllocateMemory(2 * #ChunkSize, #PB_Memory_NoClear)
    
    If *Buffer
      
      File1 = ReadFile(#PB_Any, FileName00)
      
      If File1
        
        File2 = ReadFile(#PB_Any, FileName01)
        
        If File2
          
          Remaining = Lof(File1)
          
          If Lof(File2) = Remaining
            
            ; Same size: assume equal until a chunk proves otherwise. For two
            ; empty files the loop is skipped and no read/compare is attempted.
            Equal = #True
            
            While Remaining > 0
              
              Read1 = ReadData(File1, *Buffer, #ChunkSize)
              Read2 = ReadData(File2, *Buffer + #ChunkSize, #ChunkSize)
              
              ; Read1 <= 0 while data is still expected means a read error;
              ; stop instead of looping forever. CompareMemory() returns 0
              ; when the two areas differ.
              If Read1 <= 0 Or Read1 <> Read2 Or CompareMemory(*Buffer, *Buffer + #ChunkSize, Read1) = 0
                Equal = #False
                Break
              EndIf
              
              Remaining - Read1
              
            Wend
            
          EndIf
          
          CloseFile(File2)
          
        EndIf
        
        CloseFile(File1)
        
      EndIf
      
      FreeMemory(*Buffer)
      
    EndIf
    
    ProcedureReturn Equal
  EndProcedure
  
EndModule

; <<<<<<<<<<<<<<<<<<<<<<<
; <<<<< END OF FILE <<<<<
; <<<<<<<<<<<<<<<<<<<<<<<
Best regards
StarBootics

Re: Compare Files in different ways

Posted: Tue May 02, 2023 7:00 pm
by jacdelad
Hello StarBootics,
thanks for the code. This is a slightly improved version of the simple comparison function I posted above. Yours reads segments and compares them, while mine reads and compares the whole files in one run. Good idea!