Read from PDF

Just starting out? Need help? Post your questions and find answers here.
User avatar
djes
Addict
Addict
Posts: 1806
Joined: Sat Feb 19, 2005 2:46 pm
Location: Pas-de-Calais, France

Re: Read from PDF

Post by djes »

Nice topic, could be useful. Thank you all !
verleihnix
User
User
Posts: 13
Joined: Sun Dec 14, 2008 4:55 am
Location: Switzerland

Re: Read from PDF - pdfium with german umlaute öäü

Post by verleihnix »

Hello there,

Thank you very much, this code works for me with one exception: it fails if the file name contains characters such as the German umlauts "öäü". If I load "apfel.pdf" everything is fine. When I load "äpfel.pdf" nothing gets loaded. The documents are identical except for the file name. Does somebody know a solution to this?

I am using PureBasic 5.62

I guess pdfium32.dll is unicode capable.

Code: Select all

EnableExplicit

; Signatures of the PDFium exports used below (32-bit build, stdcall-decorated names).
Prototype protoInitLibrary()
; NOTE(review): further down in this thread a file name containing umlauts only
; loaded after documentpath was declared as .p-ascii. Which path encoding
; FPDF_LoadDocument expects appears to depend on the PDFium build - verify
; against the DLL you ship.
Prototype protoLoadDocument(documentpath.p-utf8,password.p-utf8)
Prototype  protoGetMetaText(document, tag.p-utf8, buffer, buflen)
Prototype protoCloseDocument(document)

; Loads a PDF through pdfium32.dll and prints its Title, Author and Creator
; metadata to the debug output. Every failure is reported and everything that
; was acquired (library, document, buffer) is released before returning.
Procedure Main()

  Protected pdf_dll = OpenLibrary(#PB_Any, "pdfium32.dll")
  If pdf_dll = 0
    Debug "Error: pdfium32.dll could not be loaded"
    ProcedureReturn
  EndIf

  Protected InitLibrary.protoInitLibrary = GetFunction(pdf_dll,"_FPDF_InitLibrary@0")
  Protected LoadDocument.protoLoadDocument = GetFunction(pdf_dll,"_FPDF_LoadDocument@8")
  Protected GetMetaText.protoGetMetaText = GetFunction(pdf_dll,"_FPDF_GetMetaText@16")
  ; Optional: only used for cleanup, so a missing export is tolerated.
  Protected CloseDocument.protoCloseDocument = GetFunction(pdf_dll,"_FPDF_CloseDocument@4")

  ; GetFunction() returns 0 for an export that does not exist; calling through
  ; such a pointer ends in an invalid memory access at address 0.
  If InitLibrary = 0 Or LoadDocument = 0 Or GetMetaText = 0
    Debug "Error: required PDFium functions not found in pdfium32.dll"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  InitLibrary()

  Protected pdf_doc = LoadDocument("äpfel.pdf", "")
  ;Protected pdf_doc = LoadDocument("apfel.pdf", "")
  If pdf_doc = 0
    Debug "Error: PDF could not be loaded"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  Protected buflen = 1024
  Protected *buffer = AllocateMemory(buflen)
  Protected *grown
  Protected ret, i
  Protected tag.s

  If *buffer
    For i = 1 To 3
      tag = StringField("Title|Author|Creator", i, "|")

      ; FPDF_GetMetaText returns the number of BYTES needed for the UTF-16LE
      ; text including its terminator; the buffer is only filled when it fits.
      ret = GetMetaText(pdf_doc, tag, *buffer, buflen)
      If ret > buflen
        *grown = ReAllocateMemory(*buffer, ret)
        If *grown
          *buffer = *grown
          buflen = ret
          ret = GetMetaText(pdf_doc, tag, *buffer, buflen)
        EndIf
      EndIf

      If ret >= 2 And ret <= buflen
        ; bytes -> characters, minus the terminating null character
        Debug PeekS(*buffer, ret/2 - 1, #PB_Unicode)
      Else
        Debug ""
      EndIf
    Next
    FreeMemory(*buffer)
  Else
    Debug "Error: out of memory"
  EndIf

  If CloseDocument
    CloseDocument(pdf_doc)
  EndIf

  ; Close the library that was actually opened (pass its handle, not #PB_Any).
  CloseLibrary(pdf_dll)

EndProcedure

Main()

acreis
Enthusiast
Enthusiast
Posts: 182
Joined: Fri Jun 01, 2012 12:20 am

Re: Read from PDF

Post by acreis »

First try ....

change

Code: Select all

; Declaration as posted in the question: both strings are passed as UTF-8 (.p-utf8).
Prototype protoLoadDocument(documentpath.p-utf8,password.p-utf8)
to

Code: Select all

; Suggested declaration: the document path is passed as an ASCII string (.p-ascii), the password stays UTF-8.
Prototype protoLoadDocument(documentpath.p-ascii,password.p-utf8)
verleihnix
User
User
Posts: 13
Joined: Sun Dec 14, 2008 4:55 am
Location: Switzerland

Re: Read from PDF

Post by verleihnix »

Dear acreis,

Thank you very much, this change did the trick. Now i can open "äpfel.pdf" without problems.

Is there something mentioned in the pdfium library, where can I learn about this syntax?

EDIT: found this https://www.purebasic.com/documentation ... types.html will dig more into this.

- verleihnix
acreis wrote:First try ....

change

Code: Select all

Prototype protoLoadDocument(documentpath.p-utf8,password.p-utf8)
to

Code: Select all

Prototype protoLoadDocument(documentpath.p-ascii,password.p-utf8)
fabulouspaul
User
User
Posts: 34
Joined: Sun Nov 23, 2014 1:18 pm

Re: Read from PDF

Post by fabulouspaul »

I just tried PDFium and found it quite useful!

There is just one thing I can't get to work: reading a PDF page's height and width with

Code: Select all

EnableExplicit

; PDFium exports used by this example (32-bit build, stdcall-decorated names).
Prototype protoInitLibrary()
Prototype protoLoadDocument(documentpath.p-ascii,password.p-utf8)
Prototype protoCloseDocument(document)
Prototype protoGetMetaText(document, tag.p-utf8, buffer, buflen)
Prototype protoGetFileVersion(document, *version)
Prototype protoLoadPage(document, pageindex.l)
Prototype protoGetPageCount(document)
; FPDF_GetPageHeight / FPDF_GetPageWidth return a double (page size in points).
; Without the ".d" return type the result is read as an integer and is garbage.
Prototype.d protoGetPageHeight(page.i)
Prototype.d protoGetPageWidth(page.i)
Prototype protoClosePage(page.i)
Prototype protoRenderPage(hdc, page, start_x.l, start_y.l, size_x.l, size_y.l, rotate.l, flags.l)

Global pdf_dll
Global pdf_doc
Global page_handle
Global buflen
Global *buffer
Global ret
Global pages
Global version.l
Global hoehe.d
Global breite.d
Global img
Global hdc

pdf_dll = OpenLibrary(#PB_Any, "P:\Sources\PDFium\pdfium.dll")

If pdf_dll = 0
  Debug "Error loading DLL"
  End
EndIf

Global InitLibrary.protoInitLibrary        = GetFunction(pdf_dll, "_FPDF_InitLibrary@0")
Global LoadDocument.protoLoadDocument      = GetFunction(pdf_dll, "_FPDF_LoadDocument@8")
Global CloseDocument.protoCloseDocument    = GetFunction(pdf_dll, "_FPDF_CloseDocument@4")
Global GetMetaText.protoGetMetaText        = GetFunction(pdf_dll, "_FPDF_GetMetaText@16")
Global GetFileVersion.protoGetFileVersion  = GetFunction(pdf_dll, "_FPDF_GetFileVersion@8")
Global LoadPage.protoLoadPage              = GetFunction(pdf_dll, "_FPDF_LoadPage@8")
Global GetPageCount.protoGetPageCount      = GetFunction(pdf_dll, "_FPDF_GetPageCount@4")
Global GetPageHeight.protoGetPageHeight    = GetFunction(pdf_dll, "_FPDF_GetPageHeight@4")
Global GetPageWidth.protoGetPageWidth      = GetFunction(pdf_dll, "_FPDF_GetPageWidth@4")
Global ClosePage.protoClosePage            = GetFunction(pdf_dll, "_FPDF_ClosePage@4")
Global RenderPage.protoRenderPage          = GetFunction(pdf_dll, "_FPDF_RenderPage@32")

; GetFunction() returns 0 when an export is missing (e.g. a 64-bit DLL that uses
; undecorated names); calling such a pointer crashes with a write error at address 0.
If InitLibrary = 0 Or LoadDocument = 0 Or CloseDocument = 0 Or GetMetaText = 0 Or
   GetFileVersion = 0 Or LoadPage = 0 Or GetPageCount = 0 Or GetPageHeight = 0 Or
   GetPageWidth = 0 Or ClosePage = 0 Or RenderPage = 0
  Debug "Error: required PDFium functions not found in DLL"
  CloseLibrary(pdf_dll)
  End
EndIf

; Returns the metadata entry 'tag' of the loaded document as a string.
; FPDF_GetMetaText reports the size in BYTES of the UTF-16LE text including the
; terminator; an empty string is returned when nothing (usable) was written.
Procedure.s MetaText(tag.s)
  Protected bytes = GetMetaText(pdf_doc, tag, *buffer, buflen)
  If bytes < 2 Or bytes > buflen
    ProcedureReturn ""
  EndIf
  ProcedureReturn PeekS(*buffer, bytes / 2 - 1, #PB_Unicode)
EndProcedure

InitLibrary()

buflen = 1024
*buffer = AllocateMemory(buflen)
If *buffer = 0
  Debug "Error: out of memory"
  CloseLibrary(pdf_dll)
  End
EndIf

; Loading PDF
pdf_doc = LoadDocument("test.pdf", "")
If pdf_doc = 0
  Debug "Error loading PDF!"
  FreeMemory(*buffer)
  CloseLibrary(pdf_dll)
  End
EndIf

; PDF title
Debug "PDF-Titel: " + MetaText("Title")

; PDF author
Debug "Auhtor: " + MetaText("Author")

; PDF creator
Debug "Creator: " + MetaText("Creator")

; PDF version (reported as e.g. 14 for 1.4)
ret = GetFileVersion(pdf_doc, @version)
Debug "PDF-Version: " + StrF(version / 10.0, 2)

; Pages in PDF
pages = GetPageCount(pdf_doc)
Debug "Pages: " + Str(pages)

; Put a PDF page into an image and save it
; ... loading page
page_handle = LoadPage(pdf_doc, 0)
If page_handle
  ; ... get measures (in points, 1/72 inch)
  hoehe = GetPageHeight(page_handle)
  breite = GetPageWidth(page_handle)
  Debug breite
  Debug hoehe
  ; ... create image, one pixel per point (72 dpi)
  img = CreateImage(#PB_Any, breite, hoehe, 32, RGBA(255,255,255,0))
  If img
    hdc = StartDrawing(ImageOutput(img))
    If hdc
      ; ... render page so that it exactly fills the image that was created for it
      RenderPage(hdc, page_handle, 0, 0, ImageWidth(img), ImageHeight(img), 0, 0)
      StopDrawing()
      ; ... saving image
      SaveImage(img, "test.bmp", #PB_ImagePlugin_BMP)
    Else
      Debug "Error: cannot draw on image"
    EndIf
    FreeImage(img)
  Else
    Debug "Error: cannot create image"
  EndIf
  ClosePage(page_handle)
Else
  Debug "Error loading page!"
EndIf

CloseDocument(pdf_doc)
FreeMemory(*buffer)

; Close the library that was actually opened (pass its handle, not #PB_Any).
CloseLibrary(pdf_dll)

End
I know I still have to convert the height and width to pixels, but the results I get from GetPageWidth() and GetPageHeight() are always the same as each other, and they differ with every call of those functions.

Can someone help?
acreis
Enthusiast
Enthusiast
Posts: 182
Joined: Fri Jun 01, 2012 12:20 am

Re: Read from PDF

Post by acreis »

Hi

Try change

Code: Select all

Prototype protoGetPageHeight(page.i)
to

Code: Select all

; The ".d" suffix on Prototype declares a double return value.
Prototype.d protoGetPageHeight(page.i)
fabulouspaul
User
User
Posts: 34
Joined: Sun Nov 23, 2014 1:18 pm

Re: Read from PDF

Post by fabulouspaul »

acreis wrote:Hi

Try change

Code: Select all

Prototype protoGetPageHeight(page.i)
to

Code: Select all

Prototype.d protoGetPageHeight(page.i)
Thanks acreis!
The type-identifier did the trick! :D
verleihnix
User
User
Posts: 13
Joined: Sun Dec 14, 2008 4:55 am
Location: Switzerland

Re: Read from PDF

Post by verleihnix »

Hello all,

I wrote this little code snippet to load a PDF file. It reads the Title, the Author, the Creator and the Text of a PDF.
Some PDFs open just fine and all the text is displayed perfectly; for other documents the Title, the Author and the Creator are read correctly, but the text is somewhat garbled. Please see the Debug output. It looks like the characters cannot be read or displayed.

Does somebody have an idea why this is happening?

Image

I am using PureBasic 5.62

EDIT:
It looks like there is a problem with the exported PDF: when I open the PDF and copy & paste text into Notepad, I get garbled text.
https://www.experts-exchange.com/questi ... o-pdf.html

Investigating ...
/EDIT

Kind regards Mike

Code: Select all

EnableExplicit

; Size in bytes of one character written by PDFium (always UTF-16LE).
#PDFIUM_CHAR_SIZE = 2

; Signatures of the PDFium exports used below (32-bit build, stdcall-decorated names).
Prototype protoInitLibrary()
Prototype protoLoadDocument(documentpath.p-ascii,password.p-utf8)
Prototype protoGetMetaText(document, tag.p-utf8, buffer, buflen)
Prototype protoGetPageCount(document)
Prototype protoLoadPage(document, page_index)
Prototype protoClosePage(page)
Prototype protoLoadTextPage(textpage)
Prototype protoCloseTextPage(textpage)
Prototype protoCountChars(textpage)
Prototype protoGetText(textpage, istart, iCharCnt, *result)
Prototype protoCloseDocument(document)

; Loads "A_PDF.pdf" through pdfium32.dll, prints its Title, Author and Creator
; metadata and then the complete text of the first page to the debug output.
; All handles and buffers are released before the procedure returns.
Procedure Main()

  Protected pdf_dll = OpenLibrary(#PB_Any, "pdfium32.dll")
  If pdf_dll = 0
    Debug "Error: pdfium32.dll could not be loaded"
    ProcedureReturn
  EndIf

  Protected InitLibrary.protoInitLibrary = GetFunction(pdf_dll,"_FPDF_InitLibrary@0")
  Protected LoadDocument.protoLoadDocument = GetFunction(pdf_dll,"_FPDF_LoadDocument@8")
  Protected GetMetaText.protoGetMetaText = GetFunction(pdf_dll,"_FPDF_GetMetaText@16")
  Protected LoadPage.protoLoadPage = GetFunction(pdf_dll,"_FPDF_LoadPage@8")
  Protected GetText.protoGetText = GetFunction(pdf_dll,"_FPDFText_GetText@16")
  Protected LoadTextPage.protoLoadTextPage = GetFunction(pdf_dll,"_FPDFText_LoadPage@4")
  Protected CountChars.protoCountChars = GetFunction(pdf_dll,"_FPDFText_CountChars@4")
  ; Cleanup functions are optional: a missing export only skips that cleanup step.
  Protected CloseTextPage.protoCloseTextPage = GetFunction(pdf_dll,"_FPDFText_ClosePage@4")
  Protected ClosePage.protoClosePage = GetFunction(pdf_dll,"_FPDF_ClosePage@4")
  Protected CloseDocument.protoCloseDocument = GetFunction(pdf_dll,"_FPDF_CloseDocument@4")

  ; GetFunction() returns 0 for a missing export; calling through such a
  ; pointer ends in an invalid memory access at address 0.
  If InitLibrary = 0 Or LoadDocument = 0 Or GetMetaText = 0 Or LoadPage = 0 Or
     GetText = 0 Or LoadTextPage = 0 Or CountChars = 0
    Debug "Error: required PDFium functions not found in pdfium32.dll"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  Protected igPdfPage.i = 0 ;Handle of Loaded PDF Page
  Protected iPdfTextPage.i, iTotalChars.i, iBuffLen.i
  Protected *result
  Protected ret, i
  Protected tag.s

  InitLibrary()

  Protected pdf_doc = LoadDocument("A_PDF.pdf", "")
  If pdf_doc = 0
    Debug "Error: PDF could not be loaded"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  Protected buflen = 1024
  Protected *buffer = AllocateMemory(buflen)

  If *buffer
    For i = 1 To 3
      tag = StringField("Title|Author|Creator", i, "|")
      ; FPDF_GetMetaText returns the size in BYTES (UTF-16LE incl. terminator);
      ; the buffer is only filled when that size fits into buflen.
      ret = GetMetaText(pdf_doc, tag, *buffer, buflen)
      If ret >= #PDFIUM_CHAR_SIZE And ret <= buflen
        Debug PeekS(*buffer, ret / #PDFIUM_CHAR_SIZE - 1, #PB_Unicode)
      Else
        Debug ""
      EndIf
    Next
    FreeMemory(*buffer)
  EndIf

  igPdfPage = LoadPage(pdf_doc, 0)
  If igPdfPage
    iPdfTextPage = LoadTextPage(igPdfPage)
    If iPdfTextPage
      iTotalChars = CountChars(iPdfTextPage)
      If iTotalChars > 0
        ; FPDFText_GetText writes 'count' characters PLUS a terminator, so the
        ; buffer needs room for one extra character.
        iBuffLen = (iTotalChars + 1) * #PDFIUM_CHAR_SIZE
        *result = AllocateMemory(iBuffLen)
        If *result
          ; Request exactly the characters the buffer was sized for.
          ret = GetText(iPdfTextPage, 0, iTotalChars, *result)
          ; The return value is a CHARACTER count including the terminator
          ; (not a byte count), so it must not be halved.
          If ret > 0
            Debug PeekS(*result, ret - 1, #PB_Unicode)
          EndIf
          FreeMemory(*result)
        EndIf
      EndIf
      If CloseTextPage
        CloseTextPage(iPdfTextPage)
      EndIf
    Else
      Debug "Error: text of page could not be loaded"
    EndIf
    If ClosePage
      ClosePage(igPdfPage)
    EndIf
  Else
    Debug "Error: page could not be loaded"
  EndIf

  If CloseDocument
    CloseDocument(pdf_doc)
  EndIf

  ; Close the library that was actually opened (pass its handle, not #PB_Any).
  CloseLibrary(pdf_dll)

EndProcedure

Main()
acreis
Enthusiast
Enthusiast
Posts: 182
Joined: Fri Jun 01, 2012 12:20 am

Re: Read from PDF

Post by acreis »

Link to unreadable pdf?
verleihnix
User
User
Posts: 13
Joined: Sun Dec 14, 2008 4:55 am
Location: Switzerland

Re: Read from PDF

Post by verleihnix »

Thank you for your help, but the PDF contains customer data, I cannot Post it here. When I get another broken PDF, I will post it here for reference.

- Mike
bmcs
User
User
Posts: 21
Joined: Sat Apr 01, 2017 12:47 pm

Re: Read from PDF

Post by bmcs »

I have been looking for something like this for a while. Unfortunately, when I try either of the examples by fabulouspaul or verleihnix InitLibrary() generates an "Invalid memory access. (write error at address 0)" error.
PB 5.70 on Win 7 Ult X64
Any thoughts or pointers would be welcome.
Best Regards
Dave
Sirius-2337
User
User
Posts: 53
Joined: Sat May 14, 2011 10:39 am

Re: Read from PDF

Post by Sirius-2337 »

bmcs wrote:I have been looking for something like this for a while. Unfortunately, when I try either of the examples by fabulouspaul or verleihnix InitLibrary() generates an "Invalid memory access. (write error at address 0)" error.
PB 5.70 on Win 7 Ult X64
Any thoughts or pointers would be welcome.
Best Regards
Dave
x64 Dll has different function names:

Code: Select all

EnableExplicit

; Size in bytes of one character written by PDFium (always UTF-16LE).
#PDFIUM_CHAR_SIZE = 2

; Signatures of the PDFium exports used below. The 64-bit DLL exports plain,
; undecorated names (no leading underscore, no @N suffix).
Prototype protoInitLibrary()
Prototype protoLoadDocument(documentpath.p-ascii,password.p-utf8)
Prototype protoGetMetaText(document, tag.p-utf8, buffer, buflen)
Prototype protoGetPageCount(document)
Prototype protoLoadPage(document, page_index)
Prototype protoClosePage(page)
Prototype protoLoadTextPage(textpage)
Prototype protoCloseTextPage(textpage)
Prototype protoCountChars(textpage)
Prototype protoGetText(textpage, istart, iCharCnt, *result)
Prototype protoCloseDocument(document)

; Loads "test.pdf" through pdfium64.dll, prints its Title, Author and Creator
; metadata and then the complete text of the first page to the debug output.
; All handles and buffers are released before the procedure returns.
Procedure Main()

  Protected pdf_dll = OpenLibrary(#PB_Any, "pdfium64.dll")
  If pdf_dll = 0
    Debug "Error: pdfium64.dll could not be loaded"
    ProcedureReturn
  EndIf

  Protected InitLibrary.protoInitLibrary = GetFunction(pdf_dll,"FPDF_InitLibrary")
  Protected LoadDocument.protoLoadDocument = GetFunction(pdf_dll,"FPDF_LoadDocument")
  Protected GetMetaText.protoGetMetaText = GetFunction(pdf_dll,"FPDF_GetMetaText")
  Protected LoadPage.protoLoadPage = GetFunction(pdf_dll,"FPDF_LoadPage")
  Protected GetText.protoGetText = GetFunction(pdf_dll,"FPDFText_GetText")
  Protected LoadTextPage.protoLoadTextPage = GetFunction(pdf_dll,"FPDFText_LoadPage")
  Protected CountChars.protoCountChars = GetFunction(pdf_dll,"FPDFText_CountChars")
  ; Cleanup functions are optional: a missing export only skips that cleanup step.
  Protected CloseTextPage.protoCloseTextPage = GetFunction(pdf_dll,"FPDFText_ClosePage")
  Protected ClosePage.protoClosePage = GetFunction(pdf_dll,"FPDF_ClosePage")
  Protected CloseDocument.protoCloseDocument = GetFunction(pdf_dll,"FPDF_CloseDocument")

  ; GetFunction() returns 0 for a missing export (for example when the
  ; decorated 32-bit names are used with the 64-bit DLL); calling through such
  ; a pointer ends in an invalid memory access at address 0.
  If InitLibrary = 0 Or LoadDocument = 0 Or GetMetaText = 0 Or LoadPage = 0 Or
     GetText = 0 Or LoadTextPage = 0 Or CountChars = 0
    Debug "Error: required PDFium functions not found in pdfium64.dll"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  Protected igPdfPage.i = 0 ;Handle of Loaded PDF Page
  Protected iPdfTextPage.i, iTotalChars.i, iBuffLen.i
  Protected *result
  Protected ret, i
  Protected tag.s

  InitLibrary()

  Protected pdf_doc = LoadDocument("test.pdf", "")
  If pdf_doc = 0
    Debug "Error: PDF could not be loaded"
    CloseLibrary(pdf_dll)
    ProcedureReturn
  EndIf

  Protected buflen = 1024
  Protected *buffer = AllocateMemory(buflen)

  If *buffer
    For i = 1 To 3
      tag = StringField("Title|Author|Creator", i, "|")
      ; FPDF_GetMetaText returns the size in BYTES (UTF-16LE incl. terminator);
      ; the buffer is only filled when that size fits into buflen.
      ret = GetMetaText(pdf_doc, tag, *buffer, buflen)
      If ret >= #PDFIUM_CHAR_SIZE And ret <= buflen
        Debug PeekS(*buffer, ret / #PDFIUM_CHAR_SIZE - 1, #PB_Unicode)
      Else
        Debug ""
      EndIf
    Next
    FreeMemory(*buffer)
  EndIf

  igPdfPage = LoadPage(pdf_doc, 0)
  If igPdfPage
    iPdfTextPage = LoadTextPage(igPdfPage)
    If iPdfTextPage
      iTotalChars = CountChars(iPdfTextPage)
      If iTotalChars > 0
        ; FPDFText_GetText writes 'count' characters PLUS a terminator, so the
        ; buffer needs room for one extra character.
        iBuffLen = (iTotalChars + 1) * #PDFIUM_CHAR_SIZE
        *result = AllocateMemory(iBuffLen)
        If *result
          ; Request exactly the characters the buffer was sized for.
          ret = GetText(iPdfTextPage, 0, iTotalChars, *result)
          ; The return value is a CHARACTER count including the terminator
          ; (not a byte count), so it must not be halved.
          If ret > 0
            Debug PeekS(*result, ret - 1, #PB_Unicode)
          EndIf
          FreeMemory(*result)
        EndIf
      EndIf
      If CloseTextPage
        CloseTextPage(iPdfTextPage)
      EndIf
    Else
      Debug "Error: text of page could not be loaded"
    EndIf
    If ClosePage
      ClosePage(igPdfPage)
    EndIf
  Else
    Debug "Error: page could not be loaded"
  EndIf

  If CloseDocument
    CloseDocument(pdf_doc)
  EndIf

  ; Close the library that was actually opened (pass its handle, not #PB_Any).
  CloseLibrary(pdf_dll)

EndProcedure

Main()
Post Reply