Read from PDF

djes · Post by **djes** » Wed Oct 26, 2016 12:47 pm

Nice topic, could be useful. Thank you all !

verleihnix · Post by **verleihnix** » Sun Jul 08, 2018 3:47 pm

Hello there,

Thank you very much, this code works for me with one exception: when the file name contains characters like the German umlauts "öäü". If I load "apfel.pdf" everything is fine. When I load "äpfel.pdf" nothing gets loaded. The documents are the same except for the file name. Does somebody know a solution to this?

I am using PureBasic 5.62

I guess pdfium32.dll is unicode capable.

Code: Select all

EnableExplicit

; Function-pointer types for the pdfium entry points that Main() resolves with GetFunction().
; The p-utf8 pseudotype makes PureBasic convert the string argument to UTF-8 for the call.
Prototype protoInitLibrary()
; NOTE(review): with documentpath declared as p-utf8, the post above reports that a file
; name containing umlauts ("äpfel.pdf") is not loaded, while "apfel.pdf" works.
Prototype protoLoadDocument(documentpath.p-utf8,password.p-utf8)
; buffer/buflen: caller-supplied output buffer and its size (Main() passes an AllocateMemory() block).
Prototype  protoGetMetaText(document, tag.p-utf8, buffer, buflen)

; Opens pdfium32.dll, loads one PDF and prints its Title, Author and Creator
; metadata to the debug output. Every resource acquired here is released again
; before the procedure returns.
Procedure Main()

  Protected pdf_dll = OpenLibrary(#PB_Any, "pdfium32.dll")
  If pdf_dll = 0
    Debug "Error loading pdfium32.dll"
    ProcedureReturn
  EndIf

  Protected InitLibrary.protoInitLibrary = GetFunction(pdf_dll,"_FPDF_InitLibrary@0")
  Protected LoadDocument.protoLoadDocument = GetFunction(pdf_dll,"_FPDF_LoadDocument@8")
  Protected GetMetaText.protoGetMetaText = GetFunction(pdf_dll,"_FPDF_GetMetaText@16")

  ; GetFunction() returns 0 for an unknown export; calling through it would crash.
  If InitLibrary = 0 Or LoadDocument = 0 Or GetMetaText = 0
    Debug "Error resolving pdfium functions"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  InitLibrary()

  Protected pdf_doc = LoadDocument("äpfel.pdf", "")
  ;Protected pdf_doc = LoadDocument("apfel.pdf", "")
  If pdf_doc = 0
    Debug "Error loading PDF!"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  Protected buflen = 1024
  Protected *buffer = AllocateMemory(buflen)
  If *buffer = 0
    Debug "Error allocating buffer"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  Protected ret, i
  Protected tag.s

  For i = 1 To 3
    tag = StringField("Title,Author,Creator", i, ",")
    ret = GetMetaText(pdf_doc, tag, *buffer, buflen)
    ; ret is a byte count; the text is 2-byte characters, hence ret/2.
    ; When the value does not fit into buflen the buffer still holds the
    ; previous tag, so print an empty line instead of stale data.
    If ret > 0 And ret <= buflen
      Debug PeekS(*buffer, ret/2)
    Else
      Debug ""
    EndIf
  Next

  FreeMemory(*buffer)

  ; Close the library through its handle; #PB_Any is only the "generate an ID"
  ; constant for OpenLibrary(), not a library identifier.
  CloseLibrary(pdf_dll)

EndProcedure

Main()

acreis · Post by **acreis** » Mon Jul 09, 2018 6:08 pm

First try ....

change

Code: Select all

Prototype protoLoadDocument(documentpath.p-utf8,password.p-utf8)

to

Code: Select all

Prototype protoLoadDocument(documentpath.p-ascii,password.p-utf8)

verleihnix · Post by **verleihnix** » Tue Jul 10, 2018 4:25 pm

Dear acreis,

Thank you very much, this change did the trick. Now i can open "äpfel.pdf" without problems.

Is there something mentioned in the pdfium library, where can I learn about this syntax?

EDIT: found this https://www.purebasic.com/documentation ... types.html will dig more into this.

- verleihnix

acreis wrote:First try ....

change
Code: Select all
Prototype protoLoadDocument(documentpath.p-utf8,password.p-utf8)
to
Code: Select all
Prototype protoLoadDocument(documentpath.p-ascii,password.p-utf8)

fabulouspaul · Post by **fabulouspaul** » Wed Jul 11, 2018 2:33 pm

I just tried PDFium and found it quite useful!

Just one thing i dont get to work: if i try to read a PDFs height and width with

Code: Select all

EnableExplicit

; Function-pointer types for the pdfium entry points resolved with GetFunction() below.
Prototype protoInitLibrary()
Prototype protoLoadDocument(documentpath.p-ascii,password.p-utf8)
Prototype protoCloseDocument(document)
; buffer/buflen: caller-supplied output buffer and its size.
Prototype protoGetMetaText(document, tag.p-utf8, buffer, buflen)
; *version: address of the variable that receives the version (the script passes @version).
Prototype protoGetFileVersion(document, *version)
Prototype protoLoadPage(document, pageindex.l)
Prototype protoGetPageCount(document)
; NOTE(review): the two page-size prototypes are declared without a return type, while the
; script stores their results in .d variables; the reply further down in this thread
; suggests Prototype.d for protoGetPageHeight - confirm and apply to both.
Prototype protoGetPageHeight(page.i)
Prototype protoGetPageWidth(page.i)
Prototype protoClosePage(page.i)
; hdc: the script passes the value returned by StartDrawing(ImageOutput(img)).
Prototype protoRenderPage(hdc, page, start_x.l, start_y.l, size_x.l, size_y.l, rotate.l, flags.l)

; Library, document and page handles
Global pdf_dll, pdf_doc, page_handle
; Metadata output buffer and its size in bytes
Global *buffer, buflen
; Last call result, page count and PDF version number
Global ret, pages, version.l
; Page height ("hoehe") and width ("breite")
Global hoehe.d, breite.d
; Target image and the drawing handle returned by StartDrawing()
Global img, hdc

; Main script: load pdfium, print the document metadata, then render the first
; page into an image and save it as test.bmp.
pdf_dll = OpenLibrary(#PB_Any, "P:\Sources\PDFium\pdfium.dll")

If pdf_dll = 0
  Debug "Error loading DLL"
  End
EndIf

Global InitLibrary.protoInitLibrary        = GetFunction(pdf_dll, "_FPDF_InitLibrary@0")
Global LoadDocument.protoLoadDocument      = GetFunction(pdf_dll, "_FPDF_LoadDocument@8")
Global CloseDocument.protoCloseDocument    = GetFunction(pdf_dll, "_FPDF_CloseDocument@4")
Global GetMetaText.protoGetMetaText        = GetFunction(pdf_dll, "_FPDF_GetMetaText@16")
Global GetFileVersion.protoGetFileVersion  = GetFunction(pdf_dll, "_FPDF_GetFileVersion@8")
Global LoadPage.protoLoadPage              = GetFunction(pdf_dll, "_FPDF_LoadPage@8")
Global GetPageCount.protoGetPageCount      = GetFunction(pdf_dll, "_FPDF_GetPageCount@4")
Global GetPageHeight.protoGetPageHeight    = GetFunction(pdf_dll, "_FPDF_GetPageHeight@4")
Global GetPageWidth.protoGetPageWidth      = GetFunction(pdf_dll, "_FPDF_GetPageWidth@4")
Global ClosePage.protoClosePage            = GetFunction(pdf_dll, "_FPDF_ClosePage@4")
Global RenderPage.protoRenderPage          = GetFunction(pdf_dll, "_FPDF_RenderPage@32")

InitLibrary()

buflen = 1024
*buffer = AllocateMemory(buflen)
If *buffer = 0
  Debug "Error allocating buffer"
  CloseLibrary(pdf_dll)
  End
EndIf

; Loading PDF
pdf_doc = LoadDocument("test.pdf", "")
If pdf_doc = 0
  Debug "Error loading PDF!"
  FreeMemory(*buffer)
  CloseLibrary(pdf_dll)
  End
EndIf

; PDF title. GetMetaText() reports a byte count while PeekS() expects a length
; in characters, so the count is halved (2-byte characters).
ret = GetMetaText(pdf_doc,"Title",*buffer, buflen)
Debug "PDF-Titel: " + PeekS(*buffer, ret/2)

; PDF author
ret = GetMetaText(pdf_doc,"Author",*buffer, buflen)
Debug "Author: " + PeekS(*buffer, ret/2)

; PDF creator
ret = GetMetaText(pdf_doc,"Creator",*buffer, buflen)
Debug "Creator: " + PeekS(*buffer, ret/2)

; PDF version
ret = GetFileVersion(pdf_doc, @version)
Debug "PDF-Version: " + StrF(version/10, 2)

; Pages in PDF
pages = GetPageCount(pdf_doc)
Debug "Pages: " + Str(pages)

; Put a PDF page into an image and save it
; ... loading page
page_handle = LoadPage(pdf_doc, 0)
If page_handle
  ; ... get measures
  hoehe = GetPageHeight(page_handle)
  breite = GetPageWidth(page_handle)
  Debug breite
  Debug hoehe
  ; ... create image
  img = CreateImage(#PB_Any, breite, hoehe, 32, RGBA(255,255,255,0))
  If img
    hdc = StartDrawing(ImageOutput(img))
    If hdc
      ; ... render page to image
      ret = RenderPage(hdc, page_handle, 0, 0, 768, 1024, 0, 0)
      StopDrawing()
      ; ... saving image
      SaveImage(img, "test.bmp", #PB_ImagePlugin_BMP)
    Else
      Debug "Error starting drawing"
    EndIf
    FreeImage(img)
  Else
    Debug "Error creating image"
  EndIf
  ClosePage(page_handle)
Else
  Debug "Error loading page"
EndIf

CloseDocument(pdf_doc)
FreeMemory(*buffer)

; Close the library through its handle; #PB_Any is only the "generate an ID"
; constant for OpenLibrary(), not a library identifier.
CloseLibrary(pdf_dll)

End

I know I have to do some converting of height and width to pixels, but the results I get from GetPageWidth() and GetPageHeight() are always equal to each other, and they change with every call of those functions.

Can someone help?

acreis · Post by **acreis** » Thu Jul 12, 2018 1:20 am

Hi

Try change

Code: Select all

Prototype protoGetPageHeight(page.i)

to

Code: Select all

Prototype.d protoGetPageHeight(page.i)

fabulouspaul · Post by **fabulouspaul** » Thu Jul 12, 2018 9:37 am

acreis wrote:Hi

Try change
Code: Select all
Prototype protoGetPageHeight(page.i)
to
Code: Select all
Prototype.d protoGetPageHeight(page.i)

Thanks acreis!
The type-identifier did the trick!

verleihnix · Post by **verleihnix** » Fri Aug 03, 2018 5:09 pm

Hello all,

I wrote this little code snippet to load a PDF file. It reads the Title, the Author, the Creator and the Text of a PDF.
Some PDFs open just fine and all the text is perfectly displayed; for other documents the Title, the Author and the Creator are read just fine, but the text is sort of garbled. Please see the Debug output. It looks like the characters cannot be read or displayed.

Does somebody have an idea why this is happening?

I am using PureBasic 5.62

EDIT:
It looks like there is a problem with the exported PDF: when I open the PDF and copy & paste text into Notepad, I get garbled text.
https://www.experts-exchange.com/questi ... o-pdf.html

Investigating ...
/EDIT

Kind regards Mike

Code: Select all

EnableExplicit

; Function-pointer types for the pdfium entry points used by Main().
Prototype protoInitLibrary()
Prototype protoLoadDocument(documentpath.p-ascii,password.p-utf8)
; buffer/buflen: caller-supplied output buffer and its size.
Prototype protoGetMetaText(document, tag.p-utf8, buffer, buflen)
Prototype protoGetPageCount(document)
Prototype protoLoadPage(document, page_index)
; NOTE(review): the parameter is named textpage, but Main() passes the handle returned by
; LoadPage() and uses the result as the text-page handle.
Prototype protoLoadTextPage(textpage)
Prototype protoCloseTextPage(textpage)
Prototype protoCountChars(textpage)
; istart/iCharCnt: first character index and number of characters requested;
; *result: caller-supplied output buffer.
Prototype protoGetText(textpage, istart, iCharCnt, *result)
Prototype protoCloseDocument(document)

; Loads A_PDF.pdf through pdfium32.dll and prints its Title, Author and Creator
; metadata followed by the text of the first page to the debug output.
Procedure Main()

  Protected pdf_dll = OpenLibrary(#PB_Any, "pdfium32.dll")
  If pdf_dll = 0
    Debug "Error loading pdfium32.dll"
    ProcedureReturn
  EndIf

  Protected InitLibrary.protoInitLibrary = GetFunction(pdf_dll,"_FPDF_InitLibrary@0")
  Protected LoadDocument.protoLoadDocument = GetFunction(pdf_dll,"_FPDF_LoadDocument@8")
  Protected GetMetaText.protoGetMetaText = GetFunction(pdf_dll,"_FPDF_GetMetaText@16")
  Protected LoadPage.protoLoadPage = GetFunction(pdf_dll,"_FPDF_LoadPage@8")
  Protected GetText.protoGetText = GetFunction(pdf_dll,"_FPDFText_GetText@16")
  Protected LoadTextPage.protoLoadTextPage = GetFunction(pdf_dll,"_FPDFText_LoadPage@4")
  Protected CountChars.protoCountChars = GetFunction(pdf_dll,"_FPDFText_CountChars@4")
  ; Optional cleanup entry points; each call below is guarded in case the export is missing.
  Protected CloseTextPage.protoCloseTextPage = GetFunction(pdf_dll,"_FPDFText_ClosePage@4")
  Protected CloseDocument.protoCloseDocument = GetFunction(pdf_dll,"_FPDF_CloseDocument@4")

  ; GetFunction() returns 0 for an unknown export; calling through it would crash.
  If InitLibrary = 0 Or LoadDocument = 0 Or GetMetaText = 0 Or LoadPage = 0 Or GetText = 0 Or LoadTextPage = 0 Or CountChars = 0
    Debug "Error resolving pdfium functions"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  Protected igPdfPage.i      ;Handle of Loaded PDF Page
  Protected iPdfTextPage.i   ;Handle of the text page built from igPdfPage
  Protected iTotalChars.l    ;.l so that an error result of -1 stays negative
  Protected iBuffLen.i
  Protected *result

  InitLibrary()

  Protected pdf_doc = LoadDocument("A_PDF.pdf", "")
  If pdf_doc = 0
    Debug "Error loading PDF!"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  Protected buflen = 1024
  Protected *buffer = AllocateMemory(buflen)
  Protected ret

  If *buffer
    ; ret is a byte count; the text is 2-byte characters, hence ret/2.
    ret = GetMetaText(pdf_doc,"Title",*buffer, buflen)
    Debug PeekS(*buffer, ret/2)

    ret = GetMetaText(pdf_doc,"Author",*buffer, buflen)
    Debug PeekS(*buffer, ret/2)

    ret = GetMetaText(pdf_doc,"Creator",*buffer, buflen)
    Debug PeekS(*buffer, ret/2)

    FreeMemory(*buffer)
  EndIf

  igPdfPage = LoadPage(pdf_doc, 0)
  If igPdfPage
    iPdfTextPage = LoadTextPage(igPdfPage)
    If iPdfTextPage
      iTotalChars = CountChars(iPdfTextPage)
      If iTotalChars > 0
        ; pdfium writes 2-byte characters plus a terminating zero, so reserve
        ; one extra character beyond the page's character count.
        iBuffLen = (iTotalChars + 1) * 2
        *result = AllocateMemory(iBuffLen)
        If *result
          ; Request exactly the characters the buffer was sized for. The return
          ; value is a character count (not bytes), so it must not be halved;
          ; the zero-terminated buffer is simply read up to its terminator.
          If GetText(iPdfTextPage, 0, iTotalChars, *result) > 0
            Debug PeekS(*result, -1, #PB_Unicode)
          EndIf
          FreeMemory(*result)
        EndIf
      EndIf
      If CloseTextPage
        CloseTextPage(iPdfTextPage)
      EndIf
    EndIf
  EndIf

  If CloseDocument
    CloseDocument(pdf_doc)
  EndIf

  ; Close the library through its handle; #PB_Any is only the "generate an ID"
  ; constant for OpenLibrary(), not a library identifier.
  CloseLibrary(pdf_dll)

EndProcedure

Main()

acreis · Post by **acreis** » Sat Aug 04, 2018 8:13 pm

Link to unreadable pdf?

verleihnix · Post by **verleihnix** » Sun Aug 05, 2018 8:47 am

Thank you for your help, but the PDF contains customer data, I cannot Post it here. When I get another broken PDF, I will post it here for reference.

- Mike

bmcs · Post by **bmcs** » Thu Aug 30, 2018 8:19 pm

I have been looking for something like this for a while. Unfortunately, when I try either of the examples by fabulouspaul or verleihnix InitLibrary() generates an "Invalid memory access. (write error at address 0)" error.
PB 5.70 on Win 7 Ult X64
Any thoughts or pointers would be welcome.
Best Regards
Dave

Sirius-2337 · Post by **Sirius-2337** » Sat Sep 01, 2018 3:32 pm

bmcs wrote:I have been looking for something like this for a while. Unfortunately, when I try either of the examples by fabulouspaul or verleihnix InitLibrary() generates an "Invalid memory access. (write error at address 0)" error.
PB 5.70 on Win 7 Ult X64
Any thoughts or pointers would be welcome.
Best Regards
Dave

x64 Dll has different function names:

Code: Select all

EnableExplicit

; Function-pointer types for the pdfium entry points used by Main().
Prototype protoInitLibrary()
Prototype protoLoadDocument(documentpath.p-ascii,password.p-utf8)
; buffer/buflen: caller-supplied output buffer and its size.
Prototype protoGetMetaText(document, tag.p-utf8, buffer, buflen)
Prototype protoGetPageCount(document)
Prototype protoLoadPage(document, page_index)
; NOTE(review): the parameter is named textpage, but Main() passes the handle returned by
; LoadPage() and uses the result as the text-page handle.
Prototype protoLoadTextPage(textpage)
Prototype protoCloseTextPage(textpage)
Prototype protoCountChars(textpage)
; istart/iCharCnt: first character index and number of characters requested;
; *result: caller-supplied output buffer.
Prototype protoGetText(textpage, istart, iCharCnt, *result)
Prototype protoCloseDocument(document)

; x64 variant: loads test.pdf through pdfium64.dll (undecorated export names)
; and prints its Title, Author and Creator metadata followed by the text of
; the first page to the debug output.
Procedure Main()

  Protected pdf_dll = OpenLibrary(#PB_Any, "pdfium64.dll")
  If pdf_dll = 0
    Debug "Error loading pdfium64.dll"
    ProcedureReturn
  EndIf

  Protected InitLibrary.protoInitLibrary = GetFunction(pdf_dll,"FPDF_InitLibrary")
  Protected LoadDocument.protoLoadDocument = GetFunction(pdf_dll,"FPDF_LoadDocument")
  Protected GetMetaText.protoGetMetaText = GetFunction(pdf_dll,"FPDF_GetMetaText")
  Protected LoadPage.protoLoadPage = GetFunction(pdf_dll,"FPDF_LoadPage")
  Protected GetText.protoGetText = GetFunction(pdf_dll,"FPDFText_GetText")
  Protected LoadTextPage.protoLoadTextPage = GetFunction(pdf_dll,"FPDFText_LoadPage")
  Protected CountChars.protoCountChars = GetFunction(pdf_dll,"FPDFText_CountChars")
  ; Optional cleanup entry points; each call below is guarded in case the export is missing.
  Protected CloseTextPage.protoCloseTextPage = GetFunction(pdf_dll,"FPDFText_ClosePage")
  Protected CloseDocument.protoCloseDocument = GetFunction(pdf_dll,"FPDF_CloseDocument")

  ; GetFunction() returns 0 for an unknown export; calling through it would crash.
  If InitLibrary = 0 Or LoadDocument = 0 Or GetMetaText = 0 Or LoadPage = 0 Or GetText = 0 Or LoadTextPage = 0 Or CountChars = 0
    Debug "Error resolving pdfium functions"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  Protected igPdfPage.i      ;Handle of Loaded PDF Page
  Protected iPdfTextPage.i   ;Handle of the text page built from igPdfPage
  ; Stored as .l: the prototype result is read as a 64-bit integer here, and
  ; truncating it to 32 bits keeps an error result of -1 negative.
  Protected iTotalChars.l
  Protected iBuffLen.i
  Protected *result

  InitLibrary()

  Protected pdf_doc = LoadDocument("test.pdf", "")
  If pdf_doc = 0
    Debug "Error loading PDF!"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  Protected buflen = 1024
  Protected *buffer = AllocateMemory(buflen)
  Protected ret

  If *buffer
    ; ret is a byte count; the text is 2-byte characters, hence ret/2.
    ret = GetMetaText(pdf_doc,"Title",*buffer, buflen)
    Debug PeekS(*buffer, ret/2)

    ret = GetMetaText(pdf_doc,"Author",*buffer, buflen)
    Debug PeekS(*buffer, ret/2)

    ret = GetMetaText(pdf_doc,"Creator",*buffer, buflen)
    Debug PeekS(*buffer, ret/2)

    FreeMemory(*buffer)
  EndIf

  igPdfPage = LoadPage(pdf_doc, 0)
  If igPdfPage
    iPdfTextPage = LoadTextPage(igPdfPage)
    If iPdfTextPage
      iTotalChars = CountChars(iPdfTextPage)
      If iTotalChars > 0
        ; pdfium writes 2-byte characters plus a terminating zero, so reserve
        ; one extra character beyond the page's character count.
        iBuffLen = (iTotalChars + 1) * 2
        *result = AllocateMemory(iBuffLen)
        If *result
          ; Request exactly the characters the buffer was sized for. The return
          ; value is a character count (not bytes), so it must not be halved;
          ; the zero-terminated buffer is simply read up to its terminator.
          If GetText(iPdfTextPage, 0, iTotalChars, *result) > 0
            Debug PeekS(*result, -1, #PB_Unicode)
          EndIf
          FreeMemory(*result)
        EndIf
      EndIf
      If CloseTextPage
        CloseTextPage(iPdfTextPage)
      EndIf
    EndIf
  EndIf

  If CloseDocument
    CloseDocument(pdf_doc)
  EndIf

  ; Close the library through its handle; #PB_Any is only the "generate an ID"
  ; constant for OpenLibrary(), not a library identifier.
  CloseLibrary(pdf_dll)

EndProcedure

Main()

PureBasic Forums - English

Read from PDF

Re: Read from PDF

Re: Read from PDF - pdfium with german umlaute öäü

Re: Read from PDF

Re: Read from PDF

Re: Read from PDF

Re: Read from PDF

Re: Read from PDF

Re: Read from PDF

Re: Read from PDF

Re: Read from PDF

Re: Read from PDF

Re: Read from PDF