Re: Link Crawler Algorithm
Posted: Sat Oct 07, 2017 11:39 pm
by Bitblazer
Don't reinvent the wheel, especially on a topic that can quickly turn out to be very tricky. Nowadays even CMS systems use JavaScript for dynamic content delivery with a desktop-like UI experience.
For simple websites, try parsing the output of wget with the --spider flag (like here).
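A minimal sketch of that approach in PureBasic, assuming wget is installed and on the PATH; wget writes its --spider log to stderr, so the error channel is read here, and the URL is only a placeholder:
Code:
; Run wget in spider mode (check links without downloading pages) and echo its log.
; Real code would parse the URL / "broken link" lines out of each log line.
Define wget = RunProgram("wget", "--spider -r -nv http://www.example.com/", "", #PB_Program_Open | #PB_Program_Read | #PB_Program_Error | #PB_Program_Hide)
Define LogLine$

If wget
  While ProgramRunning(wget)
    LogLine$ = ReadProgramError(wget) ; one stderr line per call, empty if nothing is pending
    If LogLine$ <> ""
      Debug LogLine$
    EndIf
  Wend
  CloseProgram(wget)
EndIf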
Re: Link Crawler Algorithm
Posted: Sat Oct 07, 2017 11:49 pm
by infratec
Hi,
regex.pbi is not needed anymore
Code:
; Crawl (AKJ version)
; By DarkPlayer
; Website Link Crawler Algorithm
; www.purebasic.fr/english/viewtopic.php?f=13&t=44160
#Program = "Crawl"
#Version = "2.0"
EnableExplicit
;XIncludeFile "regex.pbi"
XIncludeFile "url.pbi"
;- Structure
Structure URLListEntry
Name.s ; URL absolute name
Done.i ; True iff the hyperlinks for this page have been partly or totally extracted (or if extraction is suppressed)
Level.i ; AKJ: Link level >=0
Hash.i ; AKJ: Fingerprint (32 bits) of URL name to quickly determine whether URL is a duplicate
Origin.i ; AKJ: URL entry (0 = home url) from which this entry was found
EndStructure
;}
Global HomeURL.s ; Starting URL as http://<domain>/
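; Fetch the page at URL, pull out every <a href=...> hyperlink and append each URL that is not already in URLList()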
Procedure CrawlURL(RegEx.i, URL.s, List URLList.URLListEntry(), level)
; Debug URL ; !!! AKJ
Protected origin = ListIndex(URLList())+1 ; AKJ Source URL
level + 1 ; AKJ Level of all hyperlinks within the URL page
If ReceiveHTTPFile(URL, "link.html")
Protected SizeOfPage = FileSize("link.html")
If SizeOfPage>0
Protected *Memory = AllocateMemory(SizeOfPage)
If *Memory
If ReadFile(0, "link.html")
If ReadData(0, *Memory, SizeOfPage) = SizeOfPage
Protected HTML.s = PeekS(*Memory, SizeOfPage, #PB_UTF8)
Protected HomeURLsite.s = GetURLPart(HomeURL,#PB_URL_Site) ; AKJ
Protected RegExMatch.i
RegExMatch = ExamineRegularExpression(RegEx, HTML)
If RegExMatch
While NextRegularExpressionMatch(RegEx)
Protected NewURL.s
NewURL = RegularExpressionNamedGroup(RegEx, "url") ; AKJ
NewURL = TrimURL(NewURL) ; AKJ
NewURL = RelativeURLtoAbsolute(URL, NewURL)
NewURL = Canonicalize(NewURL)
Protected Found.i = #False
;Protected hash.l = CRC32Fingerprint(@NewURL, Len(NewURL)) ; AKJ
Protected hash.i = Val("$" + StringFingerprint(NewURL, #PB_Cipher_CRC32)) ; AKJ
Protected p, url$ ; AKJ
ForEach URLList()
If URLList()\Hash=hash ; AKJ
If URLList()\Name=NewURL
Found = #True: Break ; AKJ
EndIf
EndIf ; AKJ
Next
If Not Found ; AKJ
If AddElement(URLList())
With URLList() ; AKJ
\Name = NewURL
\Done = #False
; AKJ Do not crawl current page if not on the original website
If GetURLPart(NewURL,#PB_URL_Site)<>HomeURLsite: \Done = #True: EndIf ; AKJ
\Hash = hash ; AKJ
\level = level ; AKJ
\origin = origin ; AKJ
p = FindString(NewURL, "://", 1) ; AKJ
If p: url$ = Mid(NewURL, p+3): Else: url$ = NewURL: EndIf ; AKJ
Debug Str(level)+Space(2)+"["+Str(ListIndex(URLList())+1)+" <- "+Str(origin)+"]"+Space(2)+url$ ; AKJ !!!
EndWith ; AKJ
EndIf
EndIf
Wend
;*RegExMatch\DecRef()
EndIf ; *RegExMatch
EndIf ; ReadData()
CloseFile(0)
EndIf ; ReadFile()
FreeMemory(*Memory)
EndIf ; *Memory
EndIf ; SizeOfPage
EndIf ; ReceiveHTTPFile()
EndProcedure
Procedure GetNextUrl(List URLList.URLListEntry())
ForEach URLList()
If URLList()\Done = #False
ProcedureReturn #True
EndIf
Next
ProcedureReturn #False
EndProcedure
Procedure Crawl(BaseURL.s)
NewList URLList.URLListEntry()
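; Regular expression for a complete <a ...>...</a> element; the named group 'url' captures the href value (quoted with ", with ', or unquoted)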
#StartTag = "<[aA]( [^>]*)? [hH][rR][eE][fF]=(?P<url>"+Chr(34)+"[^"+Chr(34)+"]*"+Chr(34)+"|'[^']*'|[^ >]*)[^>]*>"
#EndTag = "</[aA]( [^>]*)?>"
#NoCloseTag = "<[^/][^>]*>"
#NoACloseTag = "</([^aA][^>]*|[aA][^ >][^>]*)?>"
#ContentPart = "[^<]*(" + #NoCloseTag + "|" +#NoACloseTag + ")*"
#HyperlinkRegExp = #StartTag + "(" + #ContentPart + ")*" + #EndTag
Protected RegEx.i = CreateRegularExpression(#PB_Any, #HyperlinkRegExp)
If RegEx
HomeURL = Canonicalize(TrimURL(BaseURL)) ; AKJ
Debug "0 [0] "+HomeURL ; AKJ !!!
CrawlURL(RegEx, HomeURL, URLList(), 0) ; AKJ
While GetNextUrl(URLList())
URLList()\Done = #True
CrawlURL(RegEx, URLList()\Name, URLList(), URLList()\Level) ; AKJ
Wend
FreeRegularExpression(RegEx)
EndIf
EndProcedure
InitNetwork()
UseCRC32Fingerprint()
;Crawl("www.purebasic.com") ; AKJ
Crawl("www.paperfile.net") ; AKJ
But you should replace ReceiveHTTPFile() with ReceiveHTTPMemory().
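Roughly this pattern, as a sketch (the buffer returned by ReceiveHTTPMemory() has to be freed with FreeMemory(), and the temporary link.html plus the ReadFile()/ReadData() steps go away):
Code:
InitNetwork()

Define URL.s = "http://www.example.com/" ; placeholder URL
Define *Buffer = ReceiveHTTPMemory(URL)  ; download straight into memory

If *Buffer
  Define HTML.s = PeekS(*Buffer, MemorySize(*Buffer), #PB_UTF8)
  Debug Len(HTML)       ; the page is now in HTML.s, ready for the regex pass
  FreeMemory(*Buffer)   ; the caller owns the buffer and must free it
EndIf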
Bernd
Re: Link Crawler Algorithm
Posted: Sat Oct 07, 2017 11:53 pm
by infratec
Ok, done:
Code:
; Crawl (AKJ version)
; By DarkPlayer
; Website Link Crawler Algorithm
; www.purebasic.fr/english/viewtopic.php?f=13&t=44160
#Program = "Crawl"
#Version = "2.0"
EnableExplicit
;XIncludeFile "regex.pbi"
XIncludeFile "url.pbi"
;- Structure
Structure URLListEntry
Name.s ; URL absolute name
Done.i ; True iff the hyperlinks for this page have been partly or totally extracted (or if extraction is suppressed)
Level.i ; AKJ: Link level >=0
Hash.i ; AKJ: Fingerprint (32 bits) of URL name to quickly determine whether URL is a duplicate
Origin.i ; AKJ: URL entry (0 = home url) from which this entry was found
EndStructure
;}
Global HomeURL.s ; Starting URL as http://<domain>/
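; Fetch the page at URL, pull out every <a href=...> hyperlink and append each URL that is not already in URLList()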
Procedure CrawlURL(RegEx.i, URL.s, List URLList.URLListEntry(), level)
; Debug URL ; !!! AKJ
Protected origin = ListIndex(URLList())+1 ; AKJ Source URL
level + 1 ; AKJ Level of all hyperlinks within the URL page
Protected *Memory = ReceiveHTTPMemory(URL)
If *Memory
Protected HTML.s = PeekS(*Memory, MemorySize(*Memory), #PB_UTF8)
Protected HomeURLsite.s = GetURLPart(HomeURL,#PB_URL_Site) ; AKJ
Protected RegExMatch.i
RegExMatch = ExamineRegularExpression(RegEx, HTML)
If RegExMatch
While NextRegularExpressionMatch(RegEx)
Protected NewURL.s
NewURL = RegularExpressionNamedGroup(RegEx, "url") ; AKJ
NewURL = TrimURL(NewURL) ; AKJ
NewURL = RelativeURLtoAbsolute(URL, NewURL)
NewURL = Canonicalize(NewURL)
Protected Found.i = #False
;Protected hash.l = CRC32Fingerprint(@NewURL, Len(NewURL)) ; AKJ
Protected hash.i = Val("$" + StringFingerprint(NewURL, #PB_Cipher_CRC32)) ; AKJ
Protected p, url$ ; AKJ
ForEach URLList()
If URLList()\Hash=hash ; AKJ
If URLList()\Name=NewURL
Found = #True: Break ; AKJ
EndIf
EndIf ; AKJ
Next
If Not Found ; AKJ
If AddElement(URLList())
With URLList() ; AKJ
\Name = NewURL
\Done = #False
; AKJ Do not crawl current page if not on the original website
If GetURLPart(NewURL,#PB_URL_Site)<>HomeURLsite: \Done = #True: EndIf ; AKJ
\Hash = hash ; AKJ
\level = level ; AKJ
\origin = origin ; AKJ
p = FindString(NewURL, "://", 1) ; AKJ
If p: url$ = Mid(NewURL, p+3): Else: url$ = NewURL: EndIf ; AKJ
Debug Str(level)+Space(2)+"["+Str(ListIndex(URLList())+1)+" <- "+Str(origin)+"]"+Space(2)+url$ ; AKJ !!!
EndWith ; AKJ
EndIf
EndIf
Wend
;*RegExMatch\DecRef()
EndIf ; *RegExMatch
FreeMemory(*Memory)
EndIf ; ReceiveHTTPFile()
EndProcedure
Procedure GetNextUrl(List URLList.URLListEntry())
ForEach URLList()
If URLList()\Done = #False
ProcedureReturn #True
EndIf
Next
ProcedureReturn #False
EndProcedure
Procedure Crawl(BaseURL.s)
NewList URLList.URLListEntry()
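; Regular expression for a complete <a ...>...</a> element; the named group 'url' captures the href value (quoted with ", with ', or unquoted)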
#StartTag = "<[aA]( [^>]*)? [hH][rR][eE][fF]=(?P<url>"+Chr(34)+"[^"+Chr(34)+"]*"+Chr(34)+"|'[^']*'|[^ >]*)[^>]*>"
#EndTag = "</[aA]( [^>]*)?>"
#NoCloseTag = "<[^/][^>]*>"
#NoACloseTag = "</([^aA][^>]*|[aA][^ >][^>]*)?>"
#ContentPart = "[^<]*(" + #NoCloseTag + "|" +#NoACloseTag + ")*"
#HyperlinkRegExp = #StartTag + "(" + #ContentPart + ")*" + #EndTag
Protected RegEx.i = CreateRegularExpression(#PB_Any, #HyperlinkRegExp)
If RegEx
HomeURL = Canonicalize(TrimURL(BaseURL)) ; AKJ
Debug "0 [0] "+HomeURL ; AKJ !!!
CrawlURL(RegEx, HomeURL, URLList(), 0) ; AKJ
While GetNextUrl(URLList())
URLList()\Done = #True
CrawlURL(RegEx, URLList()\Name, URLList(), URLList()\Level) ; AKJ
Wend
FreeRegularExpression(RegEx)
EndIf
EndProcedure
InitNetwork()
UseCRC32Fingerprint()
;Crawl("www.purebasic.com") ; AKJ
Crawl("www.paperfile.net") ; AKJ
Bernd
Re: Link Crawler Algorithm
Posted: Sun Oct 08, 2017 2:55 am
by vwidmer
Wow, thanks!