
Re: Link Crawler Algorithm

Posted: Sat Oct 07, 2017 11:39 pm
by Bitblazer
Don't reinvent the wheel, especially on a topic that can quickly turn out to be very tricky. Nowadays even CMS systems use JavaScript for dynamic content delivery, with a desktop-like UI experience.

For simple websites, try parsing the output of wget with the --spider flag (like here).
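
A rough sketch of that idea in PureBasic, assuming wget is installed and on the PATH; the procedure name is just for illustration, the start URL is only an example, and the log parsing is a heuristic for wget's default "--<timestamp>--  <url>" request lines:

Code:

Procedure SpiderWithWget(StartURL.s, LogFile.s = "wget-spider.log")
  Protected line$, p

  ; Let wget do the crawling: --spider checks links without saving pages,
  ; -r recurses into the site, -o writes wget's log to a file we can parse.
  RunProgram("wget", "--spider -r -o " + LogFile + " " + StartURL, "", #PB_Program_Wait | #PB_Program_Hide)

  If ReadFile(0, LogFile)
    While Not Eof(0)
      line$ = ReadString(0)
      ; Request lines in the log look like "--2017-10-07 23:39:00--  http://..."
      p = FindString(line$, "http", 1)
      If Left(line$, 2) = "--" And p
        Debug Mid(line$, p) ; the visited URL
      EndIf
    Wend
    CloseFile(0)
  EndIf
EndProcedure

SpiderWithWget("http://www.paperfile.net/")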

Re: Link Crawler Algorithm

Posted: Sat Oct 07, 2017 11:49 pm
by infratec
Hi,

regex.pbi is not needed anymore :mrgreen:

Code:

; Crawl (AKJ version)
; By DarkPlayer
; Website Link Crawler Algorithm
; www.purebasic.fr/english/viewtopic.php?f=13&t=44160

#Program = "Crawl"
#Version = "2.0"
EnableExplicit

;XIncludeFile "regex.pbi"
XIncludeFile "url.pbi"

;- Structure
Structure URLListEntry
  Name.s ; URL absolute name
  Done.i ; True iff the hyperlinks for this page have been partly or totally extracted (or if extraction is suppressed)
  Level.i ; AKJ: Link level >=0
  Hash.i ; AKJ: Fingerprint (32 bits) of URL name to quickly determine whether URL is a duplicate
  Origin.i ; AKJ: URL entry (0 = home url) from which this entry was found
EndStructure
;}

Global HomeURL.s ; Starting URL as http://<domain>/

Procedure CrawlURL(RegEx.i, URL.s, List URLList.URLListEntry(), level)

 ; Debug URL ; !!! AKJ

  Protected origin = ListIndex(URLList())+1 ; AKJ Source URL
  level + 1 ; AKJ Level of all hyperlinks within the URL page

  If ReceiveHTTPFile(URL, "link.html")

    Protected SizeOfPage = FileSize("link.html")

    If  SizeOfPage>0

      Protected *Memory = AllocateMemory(SizeOfPage)

      If *Memory

        If ReadFile(0, "link.html")

          If ReadData(0, *Memory, SizeOfPage) = SizeOfPage

            Protected HTML.s = PeekS(*Memory, SizeOfPage, #PB_UTF8)
            Protected HomeURLsite.s = GetURLPart(HomeURL,#PB_URL_Site) ; AKJ

            Protected RegExMatch.i
            RegExMatch = ExamineRegularExpression(RegEx, HTML)

            If RegExMatch

              While NextRegularExpressionMatch(RegEx)
                Protected NewURL.s
                NewURL =  RegularExpressionNamedGroup(RegEx, "url") ; AKJ
                NewURL = TrimURL(NewURL) ; AKJ
                NewURL = RelativeURLtoAbsolute(URL, NewURL)
                NewURL = Canonicalize(NewURL)

                Protected Found.i = #False
                ;Protected hash.l = CRC32Fingerprint(@NewURL, Len(NewURL)) ; AKJ
                Protected hash.i = Val("$" + StringFingerprint(NewURL, #PB_Cipher_CRC32)) ; AKJ
                Protected p, url$ ; AKJ
                ForEach URLList()
                  If URLList()\Hash=hash ; AKJ
                    If URLList()\Name=NewURL
                      Found = #True: Break ; AKJ
                    EndIf
                  EndIf ; AKJ
                Next

                If Not Found ; AKJ
                  If AddElement(URLList())
                    With URLList() ; AKJ
                      \Name = NewURL
                      \Done = #False
                      ; AKJ  Do not crawl current page if not on the original website
                      If GetURLPart(NewURL,#PB_URL_Site)<>HomeURLsite: \Done = #True: EndIf ; AKJ
                      \Hash = hash ; AKJ
                      \level = level ; AKJ
                      \origin = origin ; AKJ
                      p =FindString(NewURL, "://", 1) ; AKJ
                      If p: url$ = Mid(NewURL, p+3): Else: url$ = NewURL: EndIf ; AKJ
                      Debug Str(level)+Space(2)+"["+Str(ListIndex(URLList())+1)+" <- "+Str(origin)+"]"+Space(2)+url$ ; AKJ !!!
                    EndWith ; AKJ
                  EndIf
                EndIf

              Wend

              ;*RegExMatch\DecRef()
            EndIf ; *RegExMatch

          EndIf ; ReadData()

          CloseFile(0)

        EndIf ; ReadFile()

        FreeMemory(*Memory)
      EndIf ; *Memory

    EndIf ; SizeOfPage

  EndIf ; ReceiveHTTPFile()

EndProcedure


Procedure GetNextUrl(List URLList.URLListEntry())

  ForEach URLList()
    If URLList()\Done = #False
      ProcedureReturn #True
    EndIf
  Next
  ProcedureReturn #False

EndProcedure


Procedure Crawl(BaseURL.s)

  NewList URLList.URLListEntry()

  #StartTag         = "<[aA]( [^>]*)? [hH][rR][eE][fF]=(?P<url>"+Chr(34)+"[^"+Chr(34)+"]*"+Chr(34)+"|'[^']*'|[^ >]*)[^>]*>"
  #EndTag           = "</[aA]( [^>]*)?>"
  #NoCloseTag       = "<[^/][^>]*>"
  #NoACloseTag      = "</([^aA][^>]*|[aA][^ >][^>]*)?>"
  #ContentPart      = "[^<]*(" + #NoCloseTag + "|" +#NoACloseTag + ")*"
  #HyperlinkRegExp  = #StartTag + "(" + #ContentPart + ")*" + #EndTag
  
  Protected RegEx.i = CreateRegularExpression(#PB_Any, #HyperlinkRegExp)
  
  If RegEx
    HomeURL = Canonicalize(TrimURL(BaseURL)) ; AKJ
    Debug "0  [0]  "+HomeURL ; AKJ !!!
    CrawlURL(RegEx, HomeURL, URLList(), 0) ; AKJ

    While GetNextUrl(URLList())
      URLList()\Done = #True
      CrawlURL(RegEx, URLList()\Name, URLList(), URLList()\Level) ; AKJ
    Wend

    FreeRegularExpression(RegEx)
  EndIf

EndProcedure

InitNetwork()

UseCRC32Fingerprint()

;Crawl("www.purebasic.com") ; AKJ
Crawl("www.paperfile.net") ; AKJ

But you should replace ReceiveHTTPFile() with ReceiveHTTPMemory().
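
Roughly, the idea is to drop the temporary file and read the page straight into memory; a minimal sketch of the swap (the full listing follows in the next post):

Code:

InitNetwork()

Define URL.s = "http://www.paperfile.net/" ; example URL only
Define *Buffer = ReceiveHTTPMemory(URL)
If *Buffer
  Define HTML.s = PeekS(*Buffer, MemorySize(*Buffer), #PB_UTF8)
  ; ... run the hyperlink RegEx over HTML here ...
  FreeMemory(*Buffer)
EndIf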

Bernd

Re: Link Crawler Algorithm

Posted: Sat Oct 07, 2017 11:53 pm
by infratec
Ok,

done it.

Code:

; Crawl (AKJ version)
; By DarkPlayer
; Website Link Crawler Algorithm
; www.purebasic.fr/english/viewtopic.php?f=13&t=44160

#Program = "Crawl"
#Version = "2.0"
EnableExplicit

;XIncludeFile "regex.pbi"
XIncludeFile "url.pbi"

;- Structure
Structure URLListEntry
  Name.s ; URL absolute name
  Done.i ; True iff the hyperlinks for this page have been partly or totally extracted (or if extraction is suppressed)
  Level.i ; AKJ: Link level >=0
  Hash.i ; AKJ: Fingerprint (32 bits) of URL name to quickly determine whether URL is a duplicate
  Origin.i ; AKJ: URL entry (0 = home url) from which this entry was found
EndStructure
;}

Global HomeURL.s ; Starting URL as http://<domain>/

Procedure CrawlURL(RegEx.i, URL.s, List URLList.URLListEntry(), level)
  
  ; Debug URL ; !!! AKJ
  
  Protected origin = ListIndex(URLList())+1 ; AKJ Source URL
  level + 1                                 ; AKJ Level of all hyperlinks within the URL page
  
  Protected *Memory = ReceiveHTTPMemory(URL)
  
  If *Memory
    
    Protected HTML.s = PeekS(*Memory, MemorySize(*Memory), #PB_UTF8)
    Protected HomeURLsite.s = GetURLPart(HomeURL,#PB_URL_Site) ; AKJ
    
    Protected RegExMatch.i
    RegExMatch = ExamineRegularExpression(RegEx, HTML)
    
    If RegExMatch
      
      While NextRegularExpressionMatch(RegEx)
        Protected NewURL.s
        NewURL =  RegularExpressionNamedGroup(RegEx, "url") ; AKJ
        NewURL = TrimURL(NewURL)                            ; AKJ
        NewURL = RelativeURLtoAbsolute(URL, NewURL)
        NewURL = Canonicalize(NewURL)
        
        Protected Found.i = #False
        ;Protected hash.l = CRC32Fingerprint(@NewURL, Len(NewURL)) ; AKJ
        Protected hash.i = Val("$" + StringFingerprint(NewURL, #PB_Cipher_CRC32)) ; AKJ
        Protected p, url$                                                         ; AKJ
        ForEach URLList()
          If URLList()\Hash=hash ; AKJ
            If URLList()\Name=NewURL
              Found = #True: Break ; AKJ
            EndIf
          EndIf ; AKJ
        Next
        
        If Not Found ; AKJ
          If AddElement(URLList())
            With URLList() ; AKJ
              \Name = NewURL
              \Done = #False
              ; AKJ  Do not crawl current page if not on the original website
              If GetURLPart(NewURL,#PB_URL_Site)<>HomeURLsite: \Done = #True: EndIf ; AKJ
              \Hash = hash                                                          ; AKJ
              \level = level                                                        ; AKJ
              \origin = origin                                                      ; AKJ
              p =FindString(NewURL, "://", 1)                                       ; AKJ
              If p: url$ = Mid(NewURL, p+3): Else: url$ = NewURL: EndIf             ; AKJ
              Debug Str(level)+Space(2)+"["+Str(ListIndex(URLList())+1)+" <- "+Str(origin)+"]"+Space(2)+url$ ; AKJ !!!
            EndWith                                                                                          ; AKJ
          EndIf
        EndIf
        
      Wend
      
      ;*RegExMatch\DecRef()
    EndIf ; *RegExMatch
    
    FreeMemory(*Memory)
    
  EndIf ; ReceiveHTTPMemory()
  
EndProcedure


Procedure GetNextUrl(List URLList.URLListEntry())
  
  ForEach URLList()
    If URLList()\Done = #False
      ProcedureReturn #True
    EndIf
  Next
  ProcedureReturn #False
  
EndProcedure


Procedure Crawl(BaseURL.s)
  
  NewList URLList.URLListEntry()
  
  #StartTag         = "<[aA]( [^>]*)? [hH][rR][eE][fF]=(?P<url>"+Chr(34)+"[^"+Chr(34)+"]*"+Chr(34)+"|'[^']*'|[^ >]*)[^>]*>"
  #EndTag           = "</[aA]( [^>]*)?>"
  #NoCloseTag       = "<[^/][^>]*>"
  #NoACloseTag      = "</([^aA][^>]*|[aA][^ >][^>]*)?>"
  #ContentPart      = "[^<]*(" + #NoCloseTag + "|" +#NoACloseTag + ")*"
  #HyperlinkRegExp  = #StartTag + "(" + #ContentPart + ")*" + #EndTag
  
  Protected RegEx.i = CreateRegularExpression(#PB_Any, #HyperlinkRegExp)
  
  If RegEx
    HomeURL = Canonicalize(TrimURL(BaseURL)) ; AKJ
    Debug "0  [0]  "+HomeURL                 ; AKJ !!!
    CrawlURL(RegEx, HomeURL, URLList(), 0)   ; AKJ
    
    While GetNextUrl(URLList())
      URLList()\Done = #True
      CrawlURL(RegEx, URLList()\Name, URLList(), URLList()\Level) ; AKJ
    Wend
    
    FreeRegularExpression(RegEx)
  EndIf
  
EndProcedure

InitNetwork()

UseCRC32Fingerprint()

;Crawl("www.purebasic.com") ; AKJ
Crawl("www.paperfile.net") ; AKJ

Bernd

Re: Link Crawler Algorithm

Posted: Sun Oct 08, 2017 2:55 am
by vwidmer
Wow Thanks