Page 1 of 1

My TAR extraction routines

Posted: Wed Mar 10, 2010 4:47 pm
by PrincieD
Hi guys!

I thought you might find my TAR extraction routines useful; I use them in DCC Manager to extract files in real time as they are downloaded. At the moment the routines can't handle "long links", but that is trivial to implement (I'll do it myself at a later date). However, the routines are thread safe! :)

Code: Select all

; Size in bytes of the memory buffer used while copying entry data (4 MiB).
#TARBUFFERSIZE = 4194304
; TAR archives are organised in 512-byte blocks (headers and padded file data).
#TARBLOCK = 512
; Byte offset of the octal size field inside a header block.
#TARFILESIZEOFFSET = 124
; Length in bytes of the entry-name field at the start of a header block.
#TARFILENAMESIZE = 100

; State of one opened archive, filled in by TAR_Open().
Structure TAR_Archive
  Handle.i  ; Win32 file handle from CreateFile_(); pointer-sized so it is not truncated in 64-bit builds
  pos.l     ; not referenced by any of the routines in this listing
EndStructure

; Opens an existing TAR archive for reading and stores its handle in *Tar.
;
; filename - path of the archive to open
; *Tar     - caller-allocated TAR_Archive structure that receives the handle
;            (pass the address of a structure variable, e.g. @MyArchive)
;
; Returns #True on success. Returns 0 when *Tar is null or the file cannot be
; opened; in the latter case *Tar\Handle is set to #INVALID_HANDLE_VALUE.
Procedure TAR_Open(filename.s, *Tar.TAR_Archive)
  Protected Handle

  ; A null pointer would crash on the *Tar\Handle assignment below.
  If *Tar = 0
    ProcedureReturn 0
  EndIf

  Debug "archive: "+filename

  ; Only read access is requested: the routines never write to the archive, and
  ; asking for write access would make read-only archives fail to open.
  ; Read/write sharing stays enabled so another writer (e.g. a running download)
  ; can keep appending to the file while it is being read.
  Handle = CreateFile_(filename, #GENERIC_READ, #FILE_SHARE_READ|#FILE_SHARE_WRITE, 0, #OPEN_EXISTING, #FILE_ATTRIBUTE_NORMAL|#FILE_FLAG_SEQUENTIAL_SCAN, 0)

  ; Always store the result so a failed open never leaves a stale handle behind.
  *Tar\Handle = Handle

  If Handle <> #INVALID_HANDLE_VALUE  ; opened successfully
    ProcedureReturn #True
  EndIf

  ProcedureReturn 0
EndProcedure

; Closes the archive handle stored in *Tar.
;
; *Tar - structure previously passed to TAR_Open()
;
; Returns #True when a handle was closed, 0 otherwise (null pointer, handle
; never opened, already closed, or CloseHandle_() failed).
Procedure TAR_Close(*Tar.TAR_Archive)
  Protected closed

  closed = #False
  If *Tar
    ; 0 is what a freshly defined structure contains; #INVALID_HANDLE_VALUE is
    ; what a failed CreateFile_() returns. Neither must be passed to CloseHandle_().
    If *Tar\Handle <> 0 And *Tar\Handle <> #INVALID_HANDLE_VALUE
      If CloseHandle_(*Tar\Handle)
        closed = #True
      EndIf
      ; Invalidate the handle so a second TAR_Close() is harmless.
      *Tar\Handle = #INVALID_HANDLE_VALUE
    EndIf
  EndIf

  ProcedureReturn closed
EndProcedure

; Counts the header entries (files and directories) stored in an open archive.
;
; *Tar - archive previously opened with TAR_Open()
;
; Returns the number of entries found, or -1 when *Tar is null or the header
; buffer cannot be allocated.
;
; Counting stops at the first header block whose name field starts with a NUL
; byte (the zero-filled blocks that terminate a TAR archive), at a truncated
; header, or at the end of the file, so trailing padding is not counted.
;
; NOTE(review): offsets are passed to SetFilePointer_() as a single 32-bit
; value, as in the original code, so very large archives are not supported.
Procedure TAR_CountFiles(*Tar.TAR_Archive)
  Protected *header
  Protected pos, filecount, filesize, n, c, started
  Protected BytesRead.l

  If *Tar = 0
    ProcedureReturn -1
  EndIf

  *header = AllocateMemory(#TARBLOCK)
  If *header = 0
    ProcedureReturn -1
  EndIf

  pos = 0        ; offset of the header block currently being examined
  filecount = 0

  Repeat

    ; go to the next header block
    If SetFilePointer_(*Tar\Handle, pos, 0, #FILE_BEGIN) = $FFFFFFFF
      Debug "INVALID_SET_FILE_POINTER"
      Break
    EndIf

    ; read the whole header; a failed or short read means there is no complete entry here
    BytesRead = 0
    If ReadFile_(*Tar\Handle, *header, #TARBLOCK, @BytesRead, 0) = 0 Or BytesRead < #TARBLOCK
      Break
    EndIf

    ; an empty name marks the zero-filled end-of-archive blocks
    If PeekB(*header) = 0
      Break
    EndIf

    ; decode the octal size field with integer arithmetic: leading spaces are
    ; skipped and decoding stops at the first byte that is not an octal digit
    filesize = 0
    started = #False
    For n = 0 To 10
      c = PeekB(*header + #TARFILESIZEOFFSET + n)
      If c >= '0' And c <= '7'
        filesize = filesize * 8 + (c - '0')
        started = #True
      ElseIf c <> ' ' Or started
        Break
      EndIf
    Next

    filecount = filecount + 1

    ; skip this header plus the entry data rounded up to whole 512-byte blocks
    pos = pos + #TARBLOCK + ((filesize + #TARBLOCK - 1) / #TARBLOCK) * #TARBLOCK

  Until pos >= GetFileSize_(*Tar\Handle, 0)

  FreeMemory(*header)
  ProcedureReturn filecount

EndProcedure

; Creates the directory chain described by newPath underneath Path.
;
; Path    - existing base directory (a trailing "\" is added when missing)
; newPath - relative path using "\" separators; every component that is
;           followed by a "\" is created, so a trailing file name is ignored
;
; Returns #True when every directory component exists afterwards, 0 when Path
; is not an existing directory or a component could not be created.
Procedure TAR_CreateDirectories(Path.s, newPath.s)
  Protected p, ok
  Protected newdir.s

  ; make sure path is a path
  If Right(Path, 1) <> "\"
    Path = Path + "\"
  EndIf
  If FileSize(Path) <> -2
    ProcedureReturn 0
  EndIf

  ok = #True
  p = 0
  Repeat
    p = FindString(newPath, "\", p+1)
    If p > 0
      newdir = Path + Left(newPath, p-1)
      ; only create what is missing, so an existing directory is not reported as a failure
      If FileSize(newdir) <> -2
        If CreateDirectory(newdir) = 0
          ; deeper levels cannot succeed once a parent is missing
          ok = #False
          Break
        EndIf
      EndIf
    EndIf
  Until p = 0 Or p = Len(newPath)

  ProcedureReturn ok

EndProcedure


; Extracts every entry of an open archive into the directory Path.
;
; *Tar - archive previously opened with TAR_Open()
; Path - existing destination directory (a trailing "\" is added when missing)
;
; Returns 0 when *Tar is null, Path is not an existing directory, the archive
; is empty, or a work buffer cannot be allocated. Otherwise returns #True once
; the scan has ended (end-of-archive block, end of file, or a seek/read error).
;
; Behaviour:
; - The archive size is queried repeatedly, so an archive that is still
;   growing (e.g. being downloaded) can be extracted in several passes; an
;   output file that is shorter than its entry is resumed by appending.
; - Entry names are decoded as ASCII, so the routine also works in Unicode builds.
; - Entry names come from the archive and are untrusted: leading separators are
;   stripped, and names containing ":" or a ".." component are skipped so
;   nothing can be written outside Path.
; - Only the name, size and type-flag header fields are read. Long names, the
;   ustar prefix field and link entries are not handled.
;
; NOTE(review): offsets and sizes are handled as single 32-bit values by the
; Win32 calls used here (as in the original code), so very large archives are
; not supported.
Procedure TAR_ExtractAll(*Tar.TAR_Archive, Path.s)
  Protected *Buffer, *header
  Protected pos, filesize, actualSize, existingSize, archiveSize
  Protected totalRead, sizeToRead, n, c, started, typeflag, unsafe
  Protected disposition, newFile
  Protected BytesRead.l, dataRead.l, bytesWritten.l
  Protected filename.s, target.s

  If *Tar = 0
    ProcedureReturn 0
  EndIf

  ; make sure path is a path
  If Right(Path, 1) <> "\"
    Path = Path + "\"
  EndIf
  If FileSize(Path) <> -2
    ProcedureReturn 0
  EndIf

  ; make sure archive isn't empty
  If GetFileSize_(*Tar\Handle, 0) = 0
    ProcedureReturn 0
  EndIf

  *Buffer = AllocateMemory(#TARBUFFERSIZE)
  If *Buffer = 0
    ProcedureReturn 0
  EndIf

  *header = AllocateMemory(#TARBLOCK)
  If *header = 0
    FreeMemory(*Buffer)  ; do not leak the transfer buffer
    ProcedureReturn 0
  EndIf

  pos = 0  ; offset of the header block currently being processed

  Repeat

    If SetFilePointer_(*Tar\Handle, pos, 0, #FILE_BEGIN) = $FFFFFFFF
      Debug "INVALID_SET_FILE_POINTER"
      Break
    EndIf

    ; read the complete header block; a short read means the header is not fully available yet
    BytesRead = 0
    If ReadFile_(*Tar\Handle, *header, #TARBLOCK, @BytesRead, 0) = 0 Or BytesRead < #TARBLOCK
      Break
    EndIf

    ; an empty name marks the zero-filled end-of-archive blocks
    If PeekB(*header) = 0
      Break
    EndIf

    ; get file name (stored as NUL-padded ASCII, at most #TARFILENAMESIZE bytes)
    filename = PeekS(*header, #TARFILENAMESIZE, #PB_Ascii)
    filename = ReplaceString(filename, "/", "\")
    Debug filename

    ; strip leading separators so absolute names end up below Path
    While Left(filename, 1) = "\"
      filename = Right(filename, Len(filename) - 1)
    Wend

    ; reject names that could escape the destination directory
    unsafe = #False
    If FindString(filename, ":", 1) > 0
      unsafe = #True
    ElseIf filename = ".." Or Left(filename, 3) = "..\" Or Right(filename, 3) = "\.." Or FindString(filename, "\..\", 1) > 0
      unsafe = #True
    EndIf

    ; decode the octal size field with integer arithmetic: leading spaces are
    ; skipped and decoding stops at the first byte that is not an octal digit
    filesize = 0
    started = #False
    For n = 0 To 10
      c = PeekB(*header + #TARFILESIZEOFFSET + n)
      If c >= '0' And c <= '7'
        filesize = filesize * 8 + (c - '0')
        started = #True
      ElseIf c <> ' ' Or started
        Break
      EndIf
    Next

    ; type flag: byte 156 of the header (after name, mode, uid, gid, size, mtime, chksum)
    typeflag = PeekB(*header + 156)

    pos = pos + #TARBLOCK  ; the entry data starts in the next block

    If filename <> "" And unsafe = #False

      target = Path + filename

      If Right(filename, 1) = "\"

        ; directory entry: create the directory structure if it does not exist yet
        If FileSize(target) = -1
          TAR_CreateDirectories(Path, filename)
        EndIf

      Else

        ; make sure the parent directories exist even when the archive has no
        ; separate directory entry for them
        If FindString(filename, "\", 1) > 0
          TAR_CreateDirectories(Path, filename)
        EndIf

        If filesize = 0

          ; zero-length regular file (type flag '0' or NUL): create it empty
          If (typeflag = '0' Or typeflag = 0) And FileSize(target) = -1
            newFile = CreateFile_(target, #FILE_WRITE_DATA, #FILE_SHARE_READ, 0, #CREATE_NEW, #FILE_ATTRIBUTE_NORMAL, 0)
            If newFile <> #INVALID_HANDLE_VALUE
              CloseHandle_(newFile)
            EndIf
          EndIf

        Else

          archiveSize = GetFileSize_(*Tar\Handle, 0)
          If pos < archiveSize

            ; calc real size of file in archive if archive is not complete
            If pos + filesize > archiveSize
              actualSize = archiveSize - pos
            Else
              actualSize = filesize
            EndIf

            ; check if the file already exists and has been extracted fully
            existingSize = FileSize(target)
            If existingSize <> actualSize

              ; An existing file larger than the entry cannot be a partial
              ; extraction of it; start over instead of resuming, otherwise the
              ; remaining byte count would become negative.
              If existingSize > filesize
                disposition = #CREATE_ALWAYS
              Else
                disposition = #OPEN_ALWAYS
              EndIf

              ; extract file
              newFile = CreateFile_(target, #FILE_WRITE_DATA|#FILE_READ_DATA, #FILE_SHARE_READ, 0, disposition, #FILE_ATTRIBUTE_NORMAL|#FILE_FLAG_SEQUENTIAL_SCAN, 0)
              If newFile <> #INVALID_HANDLE_VALUE  ; if opened file successfully

                ; resume after whatever has been written already (0 for a new file)
                totalRead = GetFileSize_(newFile, 0)

                If SetFilePointer_(newFile, 0, 0, #FILE_END) = $FFFFFFFF Or SetFilePointer_(*Tar\Handle, pos + totalRead, 0, #FILE_BEGIN) = $FFFFFFFF
                  Debug "INVALID_SET_FILE_POINTER"
                  CloseHandle_(newFile)  ; do not leak the output handle
                  Break                  ; leaves the entry loop
                EndIf

                ; copy the entry data through the memory buffer
                While totalRead < filesize
                  Debug "Reading..."

                  sizeToRead = filesize - totalRead
                  If sizeToRead > #TARBUFFERSIZE
                    sizeToRead = #TARBUFFERSIZE
                  EndIf

                  dataRead = 0
                  If ReadFile_(*Tar\Handle, *Buffer, sizeToRead, @dataRead, 0) = 0 Or dataRead = 0
                    Break
                  EndIf

                  Debug "Writing "+Str(dataRead)+" bytes..."
                  bytesWritten = 0
                  If WriteFile_(newFile, *Buffer, dataRead, @bytesWritten, 0) = 0 Or bytesWritten <> dataRead
                    Break
                  EndIf
                  totalRead = totalRead + dataRead

                  ; a short read means the archive ends here for now
                  If dataRead < sizeToRead
                    Break
                  EndIf
                Wend

                CloseHandle_(newFile)
              EndIf
            EndIf
          EndIf
        EndIf
      EndIf
    EndIf

    ; skip the entry data, rounded up to whole 512-byte blocks
    pos = pos + ((filesize + #TARBLOCK - 1) / #TARBLOCK) * #TARBLOCK

  Until pos >= GetFileSize_(*Tar\Handle, 0)

  Debug "Extraction complete."

  FreeMemory(*Buffer)
  FreeMemory(*header)

  ProcedureReturn #True

EndProcedure
Cheers!

Chris.

Re: My TAR extraction routines

Posted: Wed Mar 10, 2010 5:07 pm
by eesau
Thanks a lot, this will come in handy :)

Re: My TAR extraction routines

Posted: Wed Mar 10, 2010 5:16 pm
by PrincieD
eesau wrote:Thanks a lot, this will come in handy :)
no worries eesau :) im happy you'll find it useful :)

Re: My TAR extraction routines

Posted: Fri Oct 25, 2013 12:32 pm
by SeregaZ
any example of using this code? i try to use this one:

Code: Select all

; Declares a TAR_Archive structure variable and passes its address to the routines.
Define Pointer.TAR_Archive
; NOTE(review): a .tar.bz2 file is bzip2-compressed; its bytes are not TAR headers until it has been decompressed.
Debug TAR_Open("C:\file.tar.bz2", @Pointer)
Debug TAR_CountFiles(@Pointer)
TAR_Close(@Pointer)
but If SetFilePointer_(*Tar\Handle, pos, 0, #FILE_BEGIN) = $FFFFFFFF is stop all function.

than i try to use this one:

Code: Select all

; Declares only a pointer; no TAR_Archive structure is allocated behind it,
; so *Pointer is null when TAR_Open() assigns *Tar\Handle.
Define *Pointer.TAR_Archive
Debug TAR_Open("C:\file.tar.bz2", *Pointer)
Debug TAR_CountFiles(*Pointer)
TAR_Close(*Pointer)
but it says pointer is null at *Tar\Handle = Handle in TAR_Open function...

and probably my archive not almost tar archive... can you attach any small correct tar archive in this topic for my test? (in this tar must be a few files + folder and a few files inside folder)

Re: My TAR extraction routines

Posted: Fri Oct 25, 2013 5:03 pm
by infratec
@SeregaZ

A .tar.bz2 file is not a tar file; it is a bzip2-compressed file.
When you decompress it with bzip2, you get a .tar file,
and that file should be handled by these procedures.

Bernd

Re: My TAR extraction routines

Posted: Fri Oct 25, 2013 5:07 pm
by luis
infratec wrote: When you unbzip it, than you'll get a .tar file
I hoped this was already clear :P

http://www.purebasic.fr/english/viewtop ... 34#p429034

@seregaz you just need to find/write/port some code/link some lib/spawn some exe to uncompress the file and then use (hopefully) this one to extract from the tar. To test this code just look around with google for some .tar files and try with them or decompress the ones you already have.

http://gnuwin32.sourceforge.net/packages/bzip2.htm

Re: My TAR extraction routines

Posted: Fri Oct 25, 2013 6:18 pm
by SeregaZ
this source is great mystery for me...

so i try to understand this by study of PB examples. as i can understand my arhive first - must be unpack by BZIP2, than result of this must be unpack by TAR.

it is not my arhive file. even it is not standart arhive - at the originaly this file cant be open by bzip, because signature of file is specialy change for confusion. in originaly looks like on image, so i must to a little edit file, only after this BZIP2 can work with this file.
Image

and i want to ask question: i see some code, how to hide DLL file inside main exe file. it is possibly do with LIB file?

Re: My TAR extraction routines

Posted: Fri Oct 25, 2013 10:27 pm
by SeregaZ
i read some GnuWin32 pdf materials, not very understand it, but i try to change my code to this:

Code: Select all

; Decompresses C:\file.tar.bz2 into C:\tarpak.tar with BZ2_bzBuffToBuffDecompress
; from bzip2.dll. The output buffer starts at 100000 bytes and is doubled while
; the library reports that it is too small.

; The function returns a C int, so the prototype uses a 32-bit result type;
; this keeps negative error codes comparable in 64-bit builds as well.
Prototype.l ProtoBZ2_bzBuffToBuffDecompress(*DestBuff, *DestSize, *SourceBuff, iSourceSize, small, verbosity)
Global BZ2_bzBuffToBuffDecompress.ProtoBZ2_bzBuffToBuffDecompress

#BZ_OK = 0
#BZ_OUTBUFF_FULL = -8            ; bzip2 result code: destination buffer too small
#BZ_MAX_OUTPUT = 1073741824      ; stop growing the output buffer beyond 1 GiB

Define destLen.l                 ; in: buffer capacity, out: decompressed size (unsigned int in the C API)

hdll = OpenLibrary(#PB_Any,"C:\bzip2.dll")
If hdll
  BZ2_bzBuffToBuffDecompress = GetFunction(hdll,"BZ2_bzBuffToBuffDecompress")

  If BZ2_bzBuffToBuffDecompress

    ; load the whole compressed file into memory
    FileName$ = "C:\file.tar.bz2"
    nFileIn = ReadFile(#PB_Any, FileName$)
    If nFileIn
      iSourceSize = Lof(nFileIn)
      *SourceBuff = 0
      If iSourceSize > 0
        *SourceBuff = AllocateMemory(iSourceSize)
      EndIf
      If *SourceBuff
        If ReadData(nFileIn, *SourceBuff, iSourceSize) = iSourceSize

          ; decompress, growing the destination buffer until the data fits
          destCapacity = 100000
          result = #BZ_OUTBUFF_FULL
          Repeat
            *DestBuff = AllocateMemory(destCapacity)
            If *DestBuff = 0
              Debug "could not allocate the output buffer!"
              Break
            EndIf

            destLen = destCapacity
            result = BZ2_bzBuffToBuffDecompress(*DestBuff, @destLen, *SourceBuff, iSourceSize, 0, 0)
            If result = #BZ_OUTBUFF_FULL
              FreeMemory(*DestBuff)
              *DestBuff = 0
              destCapacity = destCapacity * 2
            EndIf
          Until result <> #BZ_OUTBUFF_FULL Or destCapacity > #BZ_MAX_OUTPUT

          If *DestBuff
            If result = #BZ_OK
              ; write the decompressed bytes (destLen of them) to the .tar file
              nFileOut = CreateFile(#PB_Any, "C:\tarpak.tar")
              If nFileOut
                WriteData(nFileOut, *DestBuff, destLen)
                CloseFile(nFileOut)
              Else
                Debug "may not create the file!"
              EndIf
            Else
              Debug "bzip2 decompression failed, code "+Str(result)
            EndIf
            FreeMemory(*DestBuff)
          ElseIf result = #BZ_OUTBUFF_FULL
            Debug "decompressed data does not fit into the output buffer limit!"
          EndIf

        Else
          Debug "could not read the source file!"
        EndIf

        FreeMemory(*SourceBuff)

      EndIf
      CloseFile(nFileIn)

    EndIf

  EndIf

  CloseLibrary(hdll)

EndIf
done :) now i must to mix bzip with tar code.

result file is nice unpack by tar code, but it is not unicode, and it made from file :( how to do this in memory? without any temporaly files?

Re: My TAR extraction routines

Posted: Fri Oct 25, 2013 11:00 pm
by Deluxe0321
Maybe this will help you:
http://www.purebasic.fr/english/viewtop ... 0&p=419000

It's tar.gz but the tar part (inside OpenTarGz(File.s) procedure) should explain how the tar format works.

Re: My TAR extraction routines

Posted: Fri Oct 25, 2013 11:19 pm
by SeregaZ
easy to say :) cant understand it... and it is for 5.20, not unicode again, work with file - not memory, and use some additional library.

this topic code not use library - only PB code - it is nice :) i want to find what part of code not work with unicode and how to change file handle to memory pointer.

Re: My TAR extraction routines

Posted: Sat Oct 26, 2013 2:15 am
by SeregaZ
as i see http://www.gamedev.net/topic/273143-tar ... r-windows/ i must to make some kind of this plan:
512b + size of file + 512 + size of file... but it cant read correctly :)

; the entry name is read from the first 100 bytes of the header, decoded as ASCII
filename$ = PeekS(*DestBuff, 100, #PB_Ascii)
; the data is read from offset 512; with length -1 PeekS stops at the first NUL byte
filecontext$ = PeekS(*DestBuff+512, -1, #PB_Ascii)

size = 0045674 1223 - what is this? after unpack size of file null.xml is 18,9 КБ (19 388 b)

TAR for teapot

Posted: Sat Oct 26, 2013 1:32 pm
by SeregaZ
TAR for teapot, like me :)
There are many things I don't understand, but I will try to explain what a TAR file is. A TAR file is a container that holds all the files that were selected when the archive was created. The whole file is divided into blocks of 512 bytes. The first 512 bytes hold all the information about the first file in the container; this header has the following structure:
name[100];
mode[8];
uid[8];
gid[8];
size[12];
mtime[12];
chksum[8];
typeflag;
linkname[100];
magic[6];
version[2];
uname[32];
gname[32];
devmajor[8];
devminor[8];
prefix[155];
padding[12];
*gnu_longname;
*gnu_longlink;

name of file it is fisrt 100b inside container, i can get it by use PeekS(*DestBuff, 100, #PB_Ascii)

Then we need to get the size of the file. The size field starts at byte 124 of the header (name = 100, + mode = 8, + uid = 8, + gid = 8)
and is 12 bytes long. The value is written in a special format (octal digits stored as text, according to the comment in PrincieD's code), and we can decode it using PrincieD's approach:

Code: Select all

              ; Decode the size field at header offset 124 (octal digits stored as text).
              ; Integer arithmetic is used; leading spaces are skipped and decoding
              ; stops at the first byte that is not an octal digit.
              filesize = 0
              started = #False
              For n = 0 To 10
                c = PeekB(*DestBuff+124+n)
                If c >= '0' And c <= '7'
                  filesize = filesize * 8 + (c - '0')
                  started = #True
                ElseIf c <> ' ' Or started
                  Break
                EndIf
              Next
    
              Debug filesize
Do you remember that our archive consists of 512-byte blocks? A file can be larger than 512 bytes, so we must know how many blocks it occupies. In my case the file is 19 KB: 19k / 512 = 37.10***. That means 37 full blocks plus a 38th block that is filled only at the beginning; the rest of the 38th block is filled with nulls. So we must round the block count of 37.10*** UP to 38. The second file starts only at block 39, not in the middle of block 38. For this we can use this calculation from PrincieD's code:

Code: Select all

; round filesize up to a whole number of 512-byte blocks using integer arithmetic only
pos = ((filesize + 511) / 512) * 512
so we have name, size and context of file in memory. we can create file, or continue to work with them in memory.

To get the second file we must repeat the whole operation from the beginning:
get the name, size and content of the file, but *DestBuff must be changed to:
*DestBuff + 512 (to skip the first file's header) + 512*38 (to skip the content of the first file, where 38 is the number of blocks its content occupies)
My theory is that if the name of the second file is empty, it means the end of the container file. But I am not sure... because when I look at this container file in Notepad, I see more data after this expected end of file... What is it? Garbage? Or some kind of data for restoring files when they are damaged?
Image



now i have question: how to get length of memory? i mean not AllocateMemory, but what length of memory that was write to this AllocateMemory. i mean AllocateMemory create 100b, but i fill only 15 of them - how i can get length of used memory inside this AllocateMemory?