Outlook MSG to Text decoder

Share your advanced PureBasic knowledge/code with the community.
dige
Addict
Addict
Posts: 1396
Joined: Wed Apr 30, 2003 8:15 am
Location: Germany
Contact:

Outlook MSG to Text decoder

Post by dige »

I tried it out just for fun and now I'm deeply shocked. ChatGPT 5 can program Purebasic and created an Outlook Msg to Text converter for me.
The whole thing worked out of the box. That's crazy. :shock: :D

Code: Select all

; ================== MSG -> Header + Body ==================
; initiated-by: Dige
; implemented-by: ChatGPT

EnableExplicit

; --- STGM ---
#STGM_READ             = $00000000
#STGM_SHARE_DENY_WRITE = $00000020
#STGM_SHARE_EXCLUSIVE  = $00000010

; --- MSG-Streams (Body) ---
#STREAM_BODY_UNICODE = "__substg1.0_1000001F"
#STREAM_BODY_ANSI    = "__substg1.0_1000001E"
#STREAM_HTML_UNICODE = "__substg1.0_1013001F"
#STREAM_HTML_ANSI    = "__substg1.0_1013001E"
#STREAM_RTF_COMP     = "__substg1.0_10090102"

; --- MSG-Streams (Header/Felder) ---
#STREAM_HEADERS_UNICODE = "__substg1.0_007D001F" ; PR_TRANSPORT_MESSAGE_HEADERS
#STREAM_HEADERS_ANSI    = "__substg1.0_007D001E"

#STREAM_SUBJECT_UNICODE = "__substg1.0_0037001F" ; PR_SUBJECT
#STREAM_SUBJECT_ANSI    = "__substg1.0_0037001E"

#STREAM_DISPLAY_TO_U    = "__substg1.0_0E04001F" ; PR_DISPLAY_TO
#STREAM_DISPLAY_TO_A    = "__substg1.0_0E04001E"

#STREAM_SENDER_NAME_U   = "__substg1.0_0C1A001F" ; PR_SENDER_NAME
#STREAM_SENDER_NAME_A   = "__substg1.0_0C1A001E"
#STREAM_SENDER_ADDR_U   = "__substg1.0_0C1F001F" ; PR_SENDER_EMAIL_ADDRESS
#STREAM_SENDER_ADDR_A   = "__substg1.0_0C1F001E"
#STREAM_SENTREP_ADDR_U  = "__substg1.0_0065001F" ; PR_SENT_REPRESENTING_EMAIL_ADDRESS
#STREAM_SENTREP_ADDR_A  = "__substg1.0_0065001E"

#STREAM_CLIENT_SUBMIT   = "__substg1.0_00390040" ; PR_CLIENT_SUBMIT_TIME (FILETIME)
#STREAM_DELIVERY_TIME   = "__substg1.0_0E060040" ; PR_MESSAGE_DELIVERY_TIME (FILETIME)

; --- COM-Interfaces (Minimal) ---

ImportC "ole32.lib"
  CoInitializeEx(*pvReserved, dwCoInit.l)
  CoUninitialize()
  StgOpenStorage(*pwcsName, pStgPriority, grfMode.l, pSnB, reserved.l, *ppstg.IStorage)
EndImport

; ---------- Helpers ----------
Procedure.i WideAlloc(str.s)
  Protected *p = AllocateMemory((Len(str)+1)*SizeOf(Character))
  If *p : PokeS(*p, str, -1, #PB_Unicode) : EndIf
  ProcedureReturn *p
EndProcedure

Procedure.s ReadAllFromIStream_Text(*stm.IStream, isUnicode.i, isAnsi.i)
  Protected chunk = 8192, *buf = AllocateMemory(chunk)
  If *buf = 0 : ProcedureReturn "" : EndIf
  Protected *mem = #Null, total = 0, bytesRead.l, ok = #True
  
  While ok
    bytesRead = 0
    If *stm\Read(*buf, chunk, @bytesRead) <> 0 : ok = #False : Break : EndIf
    If bytesRead = 0 : Break : EndIf
    If *mem = #Null
      *mem = AllocateMemory(bytesRead) : If *mem = 0 : ok = #False : Break : EndIf
      CopyMemory(*buf, *mem, bytesRead) : total = bytesRead
    Else
      Protected *new = ReAllocateMemory(*mem, total + bytesRead)
      If *new = 0 : ok = #False : Break : EndIf
      *mem = *new : CopyMemory(*buf, *mem + total, bytesRead) : total + bytesRead
    EndIf
    If bytesRead < chunk : Break : EndIf
  Wend
  
  FreeMemory(*buf)
  If ok = #False Or total = 0
    If *mem : FreeMemory(*mem) : EndIf
    ProcedureReturn ""
  EndIf
  
  Protected out.s
  If isUnicode
    out = PeekS(*mem, total/SizeOf(Character), #PB_Unicode)
  ElseIf isAnsi
    out = PeekS(*mem, total, #PB_Ascii)
  Else
    out = ""
  EndIf
  FreeMemory(*mem)
  ProcedureReturn out
EndProcedure

Procedure.i ReadAllFromIStream_Binary(*stm.IStream, *outSize.Integer)
  Protected chunk = 4096, *buf = AllocateMemory(chunk)
  If *buf = 0 : ProcedureReturn #Null : EndIf
  Protected *mem = #Null, total = 0, bytesRead.l, ok = #True
  
  While ok
    bytesRead = 0
    If *stm\Read(*buf, chunk, @bytesRead) <> 0 : ok = #False : Break : EndIf
    If bytesRead = 0 : Break : EndIf
    If *mem = #Null
      *mem = AllocateMemory(bytesRead) : If *mem = 0 : ok = #False : Break : EndIf
      CopyMemory(*buf, *mem, bytesRead) : total = bytesRead
    Else
      Protected *new = ReAllocateMemory(*mem, total + bytesRead)
      If *new = 0 : ok = #False : Break : EndIf
      *mem = *new : CopyMemory(*buf, *mem + total, bytesRead) : total + bytesRead
    EndIf
    If bytesRead < chunk : Break : EndIf
  Wend
  
  FreeMemory(*buf)
  If ok = #False Or total = 0
    If *mem : FreeMemory(*mem) : EndIf
    ProcedureReturn #Null
  EndIf
  *outSize\i = total
  ProcedureReturn *mem
EndProcedure

Procedure.s TryOpenStreamText(*stg.IStorage, name.s, isUnicode.i, isAnsi.i)
  Protected *stm.IStream, txt.s, *w = WideAlloc(name)
  If *w = 0 : ProcedureReturn "" : EndIf
  If *stg\OpenStream(*w, 0, #STGM_READ|#STGM_SHARE_EXCLUSIVE, 0, @*stm) = 0 And *stm
    txt = ReadAllFromIStream_Text(*stm, isUnicode, isAnsi)
    *stm\Release()
  EndIf
  FreeMemory(*w)
  ProcedureReturn txt
EndProcedure

Procedure.i TryOpenStreamBinary(*stg.IStorage, name.s, *outSize.Integer)
  Protected *stm.IStream, *data = #Null, *w = WideAlloc(name)
  If *w = 0 : ProcedureReturn #Null : EndIf
  If *stg\OpenStream(*w, 0, #STGM_READ|#STGM_SHARE_EXCLUSIVE, 0, @*stm) = 0 And *stm
    *data = ReadAllFromIStream_Binary(*stm, *outSize)
    *stm\Release()
  EndIf
  FreeMemory(*w)
  ProcedureReturn *data
EndProcedure

Procedure.s StripHtml(html.s)
  Protected out.s = html, s, e, pos
  out = ReplaceString(out, Chr(13), " ")
  out = ReplaceString(out, Chr(10), " ")
  While FindString(LCase(out), "<script", 1)
    s = FindString(LCase(out), "<script", 1)
    e = FindString(LCase(out), "</script>", s+1) : If e=0:Break:EndIf
    out = Left(out, s-1) + Mid(out, e+9)
  Wend
  While FindString(LCase(out), "<style", 1)
    s = FindString(LCase(out), "<style", 1)
    e = FindString(LCase(out), "</style>", s+1) : If e=0:Break:EndIf
    out = Left(out, s-1) + Mid(out, e+8)
  Wend
  pos = 1
  While pos
    pos = FindString(out, "<", 1)
    If pos
      e = FindString(out, ">", pos+1) : If e=0:Break:EndIf
      out = Left(out, pos-1) + Mid(out, e+1)
    EndIf
  Wend
  out = ReplaceString(out, "&nbsp;", " ")
  out = ReplaceString(out, "&amp;", "&")
  out = ReplaceString(out, "&lt;", "<")
  out = ReplaceString(out, "&gt;", ">")
  ProcedureReturn Trim(out)
EndProcedure

; ---- RFC822 Headerblock parsen (gefaltete Zeilen unterstützt) ----
Procedure.s GetHeaderValue(headers.s, key.s)
  Protected lines.s = headers, line.s, i, n, k.s = LCase(key + ":")
  Protected result.s
  n = CountString(lines, #CRLF$) + 1
  Dim arr.s(n-1)
  For i=0 To n-1
    arr(i) = StringField(lines, i+1, #CRLF$)
  Next
  
  For i=0 To n-1
    line = arr(i)
    If LCase(Left(Trim(line), Len(k))) = k
      result = Mid(line, Len(key)+2)
      ; Fortsetzungzeilen aufsammeln (beginnen mit Space/Tab)
      While i+1 <= n-1 And (Left(arr(i+1),1) = " " Or Left(arr(i+1),1) = #TAB$)
        result + " " + Trim(arr(i+1))
        i + 1
      Wend
      ProcedureReturn Trim(result)
    EndIf
  Next
  ProcedureReturn ""
EndProcedure

; ---- FILETIME (8 Bytes) -> ISO-String ----
Procedure.s FileTimeToIso8601(*bin, size.i)
  If size < 8 : ProcedureReturn "" : EndIf
  Protected.q ft = PeekQ(*bin)
  If ft = 0 : ProcedureReturn "" : EndIf
  Protected.d secs = ft / 10000000.0
  Protected.d offset = 11644473600.0 ; 1601->1970
  Protected.i unix = Int(secs - offset)
  If unix <= 0 : ProcedureReturn "" : EndIf
  ProcedureReturn FormatDate("%yyyy-%mm-%dd %hh:%ii:%ss", unix)
EndProcedure

; ------------------------- API -------------------------
; ExtractMsgWithHeader(path$, stripHtmlIfNeeded=#True)
; Rückgabe:
;   From: ...
;   To: ...
;   Subject: ...
;   Date: ...
;
;   <Body>
Procedure.s ExtractMsgWithHeader(path.s, stripHtmlIfNeeded.i = #True)
  Protected body.s, html.s, headersRaw.s
  Protected from.s, rcpt.s, subj.s, dateHdr.s, dateProp.s, hDate.s
  Protected *stg.IStorage
  If path = "" Or FileSize(path) <= 0 : ProcedureReturn "" : EndIf
  
  CoInitializeEx(#Null, 0)
  Protected *wPath = WideAlloc(path) : If *wPath = 0 : CoUninitialize() : ProcedureReturn "" : EndIf
  
  If StgOpenStorage(*wPath, #Null, #STGM_READ|#STGM_SHARE_DENY_WRITE, #Null, 0, @*stg) <> 0 Or *stg = 0
    FreeMemory(*wPath) : CoUninitialize() : ProcedureReturn ""
  EndIf
  FreeMemory(*wPath)
  
  ; ---- Body: Plain > HTML > RTF-Hinweis
  body = TryOpenStreamText(*stg, #STREAM_BODY_UNICODE, #True, #False)
  If body = "" : body = TryOpenStreamText(*stg, #STREAM_BODY_ANSI, #False, #True) : EndIf
  If body = ""
    html = TryOpenStreamText(*stg, #STREAM_HTML_UNICODE, #True, #False)
    If html = "" : html = TryOpenStreamText(*stg, #STREAM_HTML_ANSI, #False, #True) : EndIf
    If html <> ""
      If stripHtmlIfNeeded
        body = StripHtml(html)
      Else
        body = html
      EndIf
    EndIf
  EndIf
  If body = ""
    If TryOpenStreamText(*stg, #STREAM_RTF_COMP, #False, #False) <> ""
      body = "[Hinweis] Nur RTF-komprimierter Body gefunden (10090102). RTF-Dekompression nötig."
    EndIf
  EndIf
  
  ; ---- Headerblock
  headersRaw = TryOpenStreamText(*stg, #STREAM_HEADERS_UNICODE, #True, #False)
  If headersRaw = "" : headersRaw = TryOpenStreamText(*stg, #STREAM_HEADERS_ANSI, #False, #True) : EndIf
  If headersRaw <> ""
    from    = GetHeaderValue(headersRaw, "From")
    rcpt    = GetHeaderValue(headersRaw, "To")
    subj    = GetHeaderValue(headersRaw, "Subject")
    dateHdr = GetHeaderValue(headersRaw, "Date")
  EndIf
  
  ; ---- Fallbacks
  If subj = "" : subj = TryOpenStreamText(*stg, #STREAM_SUBJECT_UNICODE, #True, #False) : EndIf
  If subj = "" : subj = TryOpenStreamText(*stg, #STREAM_SUBJECT_ANSI,    #False, #True) : EndIf
  
  If rcpt = "" : rcpt = TryOpenStreamText(*stg, #STREAM_DISPLAY_TO_U, #True, #False) : EndIf
  If rcpt = "" : rcpt = TryOpenStreamText(*stg, #STREAM_DISPLAY_TO_A, #False, #True) : EndIf
  
  If from = ""
    from = TryOpenStreamText(*stg, #STREAM_SENDER_ADDR_U, #True, #False)
    If from = "" : from = TryOpenStreamText(*stg, #STREAM_SENDER_ADDR_A, #False, #True) : EndIf
    If from = ""
      from = TryOpenStreamText(*stg, #STREAM_SENDER_NAME_U, #True, #False)
      If from = "" : from = TryOpenStreamText(*stg, #STREAM_SENDER_NAME_A, #False, #True) : EndIf
    EndIf
    If from = ""
      from = TryOpenStreamText(*stg, #STREAM_SENTREP_ADDR_U, #True, #False)
      If from = "" : from = TryOpenStreamText(*stg, #STREAM_SENTREP_ADDR_A, #False, #True) : EndIf
    EndIf
  EndIf
  
  If dateHdr = ""
    Protected sz.Integer, *bin
    *bin = TryOpenStreamBinary(*stg, #STREAM_CLIENT_SUBMIT, @sz)
    If *bin
      dateProp = FileTimeToIso8601(*bin, sz\i) : FreeMemory(*bin)
    Else
      *bin = TryOpenStreamBinary(*stg, #STREAM_DELIVERY_TIME, @sz)
      If *bin
        dateProp = FileTimeToIso8601(*bin, sz\i) : FreeMemory(*bin)
      EndIf
    EndIf
  EndIf
  
  *stg\Release()
  CoUninitialize()
  
  ; ---- Date-Zeile ohne IIf
  If dateHdr <> ""
    hDate = "Date: " + dateHdr
  Else
    hDate = "Date: " + dateProp
  EndIf
  
  ProcedureReturn Trim("From: " + from + #CRLF$ +
                       "To: " + rcpt + #CRLF$ +
                       "Subject: " + subj + #CRLF$ +
                       hDate + #CRLF$ + #CRLF$ +
                       body)
EndProcedure

; -------------------- Beispiel --------------------
Debug ExtractMsgWithHeader("P:\outlook.msg")

"Daddy, I'll run faster, then it is not so far..."
BarryG
Addict
Addict
Posts: 4155
Joined: Thu Apr 18, 2019 8:17 am

Re: Outlook MSG to Text decoder

Post by BarryG »

Yep. Some people here mocked ChatGPT not long ago but it really can do some good error-free stuff now. I used it last week to learn how change the volume of an app (by window), and it worked immediately. I'm amazed.
Post Reply