I needed to use Link Crawler to help me in a project I'm undertaking but it has three major drawbacks for me:
1. It crawls hyperlinks of remote sites linked to the base URL, potentially resulting in an enormous output result.
2. It is too slow.
3. Insufficient information is returned to suit my needs.
These issues are [at least partially] resolved in my version of the program, given later in this post:
1. Web pages are not crawled if their domain differs from that of the base URL. This is achieved by immediately setting the 'Done' flag in the URLList() structure for such pages.
2. The program is speeded up by replacing the code that scans URLList() looking for duplicate URLs. It was changed from performing string comparisons of URL names to performing 32-binary integer comparisons of URL names hashed using the Purebasic built-in function CRC32Fingerprint().
3. Extra information is output for each web page detected in the crawl as follows:
* A link nesting level that records the number of hyperlinks followed from the base url page to reach the current page
* A simple page counter where the base URL page is page 0, the first hyperlinked page detected is page 1, the next page detected is page 2 and so on.
* A record of which page contains the hyperlink pointing to the current page. For example if page 2 hyperlinked to page 9, then the latter is recorded as [9 <- 2]. If multiple pages hyperlink to the current page, only the first is recorded.
The main program has extensive changes (each annotated with my initials AKJ), url.pbi has a few changes and regex.pbi has no changes but is repeated below for convenience.
I have also changed the test base URL from
http://www.purebasic.com to
http://www.paperfile.net as this is more convenient for detecting programming bugs.
Main program:
Code: Select all
; Crawl (AKJ version)
; By DarkPlayer
; Website Link Crawler Algorithm
; www.purebasic.fr/english/viewtopic.php?f=13&t=44160
#Program = "Crawl"
#Version = "2.0"
EnableExplicit
XIncludeFile "regex.pbi"
XIncludeFile "url.pbi"
;- Structure
Structure URLListEntry
Name.s ; URL absolute name
Done.i ; True iff the hyperlinks for this page have been partly or totally extracted (or if extraction is supressed)
Level.i ; AKJ: Link level >=0
Hash.l ; AKJ: Fingerprint (32 bits) of URL name to quickly determine whether URL is a duplicate
Origin.i ; AKJ: URL entry (0 = home url) from which this entry was found
EndStructure
;}
Global HomeURL.s ; Starting URL as http://<domain>/
Procedure CrawlURL(*RegEx.IRegEx, URL.s, List URLList.URLListEntry(), level)
; Debug URL ; !!! AKJ
Protected origin = ListIndex(URLList())+1 ; AKJ Source URL
level + 1 ; AKJ Level of all hyperlinks within the URL page
If ReceiveHTTPFile(URL, "link.html")
Protected SizeOfPage = FileSize("link.html")
If SizeOfPage>0
Protected *Memory = AllocateMemory(SizeOfPage)
If *Memory
If ReadFile(0, "link.html")
If ReadData(0, *Memory, SizeOfPage) = SizeOfPage
Protected HTML.s = PeekS(*Memory, SizeOfPage)
Protected HomeURLsite.s = GetURLPart(HomeURL,#PB_URL_Site) ; AKJ
Protected *RegExMatch.IRegExMatch
*RegExMatch = *RegEX\Match(HTML)
If *RegExMatch
While *RegExMatch\Next()
Protected NewURL.s
NewURL = *RegExMatch\GetSubstringWithName("url") ; AKJ
NewURL = TrimURL(NewURL) ; AKJ
NewURL = RelativeURLtoAbsolute(URL, NewURL)
NewURL = Canonicalize(NewURL)
Protected Found.i = #False
Protected hash.l = CRC32Fingerprint(@NewURL, Len(NewURL)) ; AKJ
Protected p, url$ ; AKJ
ForEach URLList()
If URLList()\Hash=hash ; AKJ
If URLList()\Name=NewURL
Found = #True: Break ; AKJ
EndIf
EndIf ; AKJ
Next
If Not Found ; AKJ
If AddElement(URLList())
With URLList() ; AKJ
\Name = NewURL
\Done = #False
; AKJ Do not crawl current page if not on the original website
If GetURLPart(NewURL,#PB_URL_Site)<>HomeURLsite: \Done = #True: EndIf ; AKJ
\Hash = hash ; AKJ
\level = level ; AKJ
\origin = origin ; AKJ
p =FindString(NewURL, "://", 1) ; AKJ
If p: url$ = Mid(NewURL, p+3): Else: url$ = NewURL: EndIf ; AKJ
Debug Str(level)+Space(2)+"["+Str(ListIndex(URLList())+1)+" <- "+Str(origin)+"]"+Space(2)+url$ ; AKJ !!!
EndWith ; AKJ
EndIf
EndIf
Wend
*RegExMatch\DecRef()
EndIf ; *RegExMatch
EndIf ; ReadData()
CloseFile(0)
EndIf ; ReadFile()
FreeMemory(*Memory)
EndIf ; *Memory
EndIf ; SizeOfPage
EndIf ; ReceiveHTTPFile()
EndProcedure
Procedure GetNextUrl(List URLList.URLListEntry())
ForEach URLList()
If URLList()\Done = #False
ProcedureReturn #True
EndIf
Next
ProcedureReturn #False
EndProcedure
Procedure Crawl(BaseURL.s)
NewList URLList.URLListEntry()
#StartTag = "<[aA]( [^>]*)? [hH][rR][eE][fF]=(?P<url>"+Chr(34)+"[^"+Chr(34)+"]*"+Chr(34)+"|'[^']*'|[^ >]*)[^>]*>"
#EndTag = "</[aA]( [^>]*)?>"
#NoCloseTag = "<[^/][^>]*>"
#NoACloseTag = "</([^aA][^>]*|[aA][^ >][^>]*)?>"
#ContentPart = "[^<]*(" + #NoCloseTag + "|" +#NoACloseTag + ")*"
#HyperlinkRegExp = #StartTag + "(" + #ContentPart + ")*" + #EndTag
Protected *RegEx.IRegEx = RegEx_Create(#HyperlinkRegExp)
If *RegEx
HomeURL = Canonicalize(TrimURL(BaseURL)) ; AKJ
Debug "0 [0] "+HomeURL ; AKJ !!!
CrawlURL(*RegEx, HomeURL, URLList(), 0) ; AKJ
While GetNextUrl(URLList())
URLList()\Done = #True
CrawlURL(*RegEx, URLList()\Name, URLList(), URLList()\Level) ; AKJ
Wend
*RegEx\DecRef()
EndIf ; *RegEx
EndProcedure
InitNetwork()
; Crawl("www.purebasic.com") ; AKJ
Crawl("www.paperfile.net") ; AKJ
End
url.pbi
Code: Select all
; url.pbi (AKJ version)
; By DarkPlayer
Procedure.i _URL_IsNumeric(String.s)
If String = ""
ProcedureReturn #False
EndIf
Protected i.i
For I=1 To Len(String)
If FindString("0123456789", Mid(String, I, 1), 1) = 0
ProcedureReturn #False
EndIf
Next
ProcedureReturn #True
EndProcedure
Procedure _URL_HexToVal(String.s)
String = LCase(String)
Protected I.i, Value.i
For I = 1 To Len(String)
Protected Pos.i = FindString("0123456789abcdef", Mid(String, I, 1), 1)
If Pos = 0
ProcedureReturn 0
EndIf
Value << 4
Value | (Pos - 1)
Next
ProcedureReturn Value
EndProcedure
Procedure.i _URL_IsHex(String.s)
If String = ""
ProcedureReturn #False
EndIf
Protected i.i
For I=1 To Len(String)
If FindString("0123456789abcdefABCDEF", Mid(String, I, 1), 1) = 0
ProcedureReturn #False
EndIf
Next
ProcedureReturn #True
EndProcedure
Procedure.i BSWAP_Long(Value.i)
!MOV eax, dword[p.v_Value]
!BSWAP eax
!MOV dword[p.v_Value], eax
ProcedureReturn Value
EndProcedure
Procedure.s MyURLEncoder(String.s, Host.i = #False)
If Host = #False
String = Trim(String)
EndIf
Protected i.i = 1
While i <= Len(String)
Protected MyAsc.i = Asc(Mid(String,i,1))
If Host
If MyAsc < 32 Or MyAsc >= 127 Or MyAsc = '#' Or MyAsc = '%' Or MyAsc = '\' Or MyAsc = ' '
If MyAsc = '\' And i + 1 <= Len(String) And Mid(String,i+1,1) = "x"
String = Mid(String,1,i - 1) + "%" + Mid(String,i+2)
i + 3
Else
String = Mid(String,1,i - 1) + "%"+UCase(RSet(Hex(MyAsc),2,"0")) + Mid(String,i+1)
i + 3
EndIf
Else
i + 1
EndIf
Else
If MyAsc < 32 Or MyAsc >= 127 Or MyAsc = '#' Or MyAsc = '%' Or MyAsc = '\' Or MyAsc = ' '
If MyAsc = '\'
String = Mid(String,1,i - 1) + Mid(String,i+2)
i + 2
Else
String = Mid(String,1,i - 1) + "%"+UCase(RSet(Hex(MyAsc),2,"0")) + Mid(String,i+1)
i + 3
EndIf
Else
i + 1
EndIf
EndIf
Wend
ProcedureReturn String
EndProcedure
Procedure.s TrimURL(URL.s) ; AKJ
URL = Trim(URL, #DQUOTE$) ; AKJ
URL = Trim(URL) ; AKJ
URL = Trim(URL, "'") ; AKJ
URL = Trim(URL) ; AKJ
ProcedureReturn URL ; AKJ
EndProcedure ; AKJ
Procedure.s Canonicalize(OrgURL.s)
Protected RautePos.i = FindString(OrgURL,"#",1)
If RautePos
OrgURL = Mid(OrgURL,1,RautePos-1)
OrgURL = Trim(OrgURL)
EndIf
OrgURL = Trim(OrgUrl)
Protected URL.s = URLDecoder(OrgURL)
Protected Position.i
Protected OldPosition.i
Protected Site.s
Protected Path.s
Protected NewUrl.s
If GetURLPart(URL,#PB_URL_Protocol) = "" ; AKJ
URL = "http://" + URL
EndIf
;-Hostname Canonicalize
Site.s = GetURLPart(URL,#PB_URL_Site)
Site = RTrim(Site,".")
Site = LTrim(Site,".")
Position = FindString(Site,".",1)
While Position
Protected i.i = Position + 1
While Mid(Site,i,1) = "."
i + 1
Wend
If i > Position + 1
Site = Mid(Site, 1, Position) + Mid(Site,i)
EndIf
Position = FindString(Site,".",Position + 1)
Wend
Site = LCase(Site)
If _URL_IsNumeric(Site)
Site = IPString(BSWAP_Long(Val(Site)))
ElseIf _URL_IsHex(Site)
Site = IPString(BSWAP_Long(_URL_HexToVal(Site)))
EndIf
;-Path Canonicalize
Protected NewPath.s
Protected Ignore.i = 0
Path.s = GetURLPart(URL, #PB_URL_Path)
Path.s = ReplaceString(Path, "/./","/")
For i = CountString(Path, "/") + 1 To 1 Step - 1
If i <> 1 Or StringField(Path, i, "/") <> ""
If StringField(Path, i, "/") = ".."
Ignore + 1
Else
If Ignore > 0
Ignore - 1
Else
If i <> CountString(Path, "/") + 1
If StringField(Path, i, "/")
NewPath = StringField(Path, i, "/") + "/" + NewPath
EndIf
Else
NewPath = StringField(Path, i, "/") + NewPath
EndIf
EndIf
EndIf
EndIf
Next
For i = 1 To Ignore
If i <> 1
NewPath = "/../" + NewPath
Else
NewPath = "../" + NewPath
EndIf
Next
If GetURLPart(OrgURL, #PB_URL_Protocol) <> ""
NewUrl = GetURLPart(OrgURL,#PB_URL_Protocol)+"://"+MyURLEncoder(Site,#True)
Else
NewUrl = "http://"+MyURLEncoder(Site,#True)
EndIf
;Add Port if specified
If GetURLPart(URL, #PB_URL_Port) <> ""
NewUrl + ":"+ GetURLPart(URL, #PB_URL_Port)
EndIf
NewUrl+"/"+ MyURLEncoder(NewPath)
;Add Parameters if specified
If FindString(OrgUrl,"?",1)
NewUrl + "?"+ MyURLEncoder(GetURLPart(URL, #PB_URL_Parameters))
EndIf
ProcedureReturn NewUrl
EndProcedure
Procedure.s GetBaseURL(URL.s)
Protected NewPath.s = ""
Protected part.i
For part = 1 To CountString(URL, "/")
NewPath + StringField(URL, part, "/") + "/"
Next
ProcedureReturn NewPath
EndProcedure
Procedure.s RelativeURLtoAbsolute(CurrentURL.s, RelativeURL.s)
Protected NewURL.s =""
; RelativeURL = Trim(RelativeURL) ; AKJ
If GetURLPart(RelativeURL, #PB_URL_Site)
;This URL is absolute, so just return it
ProcedureReturn RelativeURL
EndIf
If Mid(RelativeURL,1,1) = "/"
;This is relative to the host
If GetURLPart(CurrentURL, #PB_URL_Protocol)
NewURL + GetURLPart(CurrentURL, #PB_URL_Protocol) +"://"
EndIf
NewURL + GetURLPart(CurrentURL, #PB_URL_Site)
If GetURLPart(CurrentURL, #PB_URL_Port) <> ""
NewUrl + ":"+ GetURLPart(CurrentURL, #PB_URL_Port)
EndIf
NewURL + RelativeURL
ProcedureReturn NewURL
Else
ProcedureReturn GetBaseURL(CurrentURL) + RelativeURL
EndIf
EndProcedure
regex.pbi
Code: Select all
; Regex.pbi
; These Procedures are from a selfwritten Userlib, but they are not needed for single threaded apps
; I just replaced them with an almost dummy function for single threaded applications
CompilerIf Defined(LockDecInt, #PB_Function) = #False
Procedure.i LockDecInt(*Pointer.INTEGER)
*Pointer\i - 1
If *Pointer\i = 0
ProcedureReturn #True
EndIf
ProcedureReturn #False
EndProcedure
CompilerEndIf
CompilerIf Defined(LockIncInt, #PB_Function) = #False
Procedure.i LockIncInt(*Pointer.INTEGER)
*Pointer\i + 1
EndProcedure
CompilerEndIf
PrototypeC pcre_freep(*free)
ImportC ""
pcre_compile(pattern.p-ascii, options.i, *errptr, *erroffset, *tableptr)
pcre_exec(*code, *extra, subject.p-ascii, length.i, startoffset.i, options.i, *ovector, ovecsize.i)
pcre_get_substring(subject.p-ascii, *ovector, stringcount.i, stringnumber.i, *stringptr)
pcre_get_named_substring(*code, subject.p-ascii, *ovector, stringcount.i, stringname.p-ascii, *stringptr)
pcre_free_substring(*free)
pcre_free.pcre_freep
EndImport
Structure RegEx
VTable.i
*RegExCode.i
RefCounter.i
EndStructure
#REGEX_MAX = 300
Structure RegExMatch
VTable.i
*RegEx.RegEx
Result.i[#REGEX_MAX]
String.s
RefCounter.i
SubStrings.i
CurPos.i
EndStructure
Interface IRegEx
DecRef()
IncRef()
Match(String.s)
EndInterface
Interface IRegExMatch
DecRef()
IncRef()
Next()
CountSubstrings()
GetSubstring.s(Nr.i)
GetSubstringWithName.s(Name.s)
EndInterface
;-Regular Expression
Procedure.i RegEx_DecRef(*This.RegEx)
If LockDecInt(@*This\RefCounter)
pcre_free(*This\RegExCode)
ClearStructure(*This, RegEx)
FreeMemory(*This)
EndIf
EndProcedure
Procedure.i RegEx_IncRef(*This.RegEx) ; Dont change this!
LockIncInt(@*This\RefCounter)
ProcedureReturn *This
EndProcedure
Procedure RegEx_Create(String.s)
Protected *MyRegEx.RegEx = AllocateMemory(SizeOf(RegEx))
If *MyRegEx
*MyRegEx\VTable = ?RegEx
Protected Error.i
Protected ErrorOffset.i
*MyRegEx\RegExCode = pcre_compile(String, 0, @Error, @ErrorOffset, 0 )
*MyRegEx\RefCounter = 1
If *MyRegEx\RegExCode
ProcedureReturn *MyRegEx
EndIf
FreeMemory(*MyRegEx)
EndIf
ProcedureReturn #False
EndProcedure
Procedure RegEx_Match(*This.RegEx, String.s)
Protected *MyRegExMatch.RegExMatch = AllocateMemory(SizeOf(RegExMatch))
If *MyRegExMatch
InitializeStructure(*MyRegExMatch, RegExMatch)
*MyRegExMatch\VTable = ?RegExMatch
*MyRegExMatch\RefCounter = 1
*MyRegExMatch\String = String
RegEx_IncRef(*This)
*MyRegExMatch\RegEx = *This
*MyRegExMatch\CurPos = 0
ProcedureReturn *MyRegExMatch
EndIf
ProcedureReturn #False
EndProcedure
;-Matches
Procedure.i RegExMatch_DecRef(*This.RegExMatch)
If LockDecInt(@*This\RefCounter)
RegEx_DecRef(*This\RegEx)
ClearStructure(*This, RegExMatch)
FreeMemory(*This)
EndIf
EndProcedure
Procedure.i RegExMatch_IncRef(*This.RegExMatch)
LockIncInt(@*This\RefCounter)
ProcedureReturn *This
EndProcedure
Procedure RegExMatch_Next(*This.RegExMatch)
If *This
*This\SubStrings = pcre_exec(*This\RegEx\RegExCode, 0, *This\String, StringByteLength(*This\String,#PB_Ascii), *This\CurPos, 0, @*This\Result[0], #REGEX_MAX)
If *This\SubStrings > 0
*This\CurPos = *This\Result[1]
ProcedureReturn #True
Else
*This\SubStrings = 0
EndIf
EndIf
ProcedureReturn #False
EndProcedure
Procedure RegExMatch_CountSubstrings(*This.RegExMatch)
If *This
ProcedureReturn *This\SubStrings / 2
EndIf
EndProcedure
Procedure.s RegExMatch_GetSubString(*This.RegExMatch, Nr.i)
If *This And *This\SubStrings
If Nr < *This\SubStrings / 2
Protected String.i
Protected Length.i
Length = pcre_get_substring(*This\String, @*This\Result[0], *This\SubStrings, Nr, @String)
If Length > 0
Protected SubString.s = PeekS(String, Length, #PB_Ascii)
pcre_free_substring(String)
ProcedureReturn SubString
EndIf
EndIf
EndIf
ProcedureReturn ""
EndProcedure
Procedure.s RegExMatch_GetSubStringWithName(*This.RegExMatch, Name.s)
If *This And *This\SubStrings
Protected String.i
Protected Length.i
Length = pcre_get_named_substring(*This\RegEx\RegExCode, *This\String, @*This\Result[0], *This\SubStrings, Name, @String)
If Length > 0
Protected SubString.s = PeekS(String, Length, #PB_Ascii)
pcre_free_substring(String)
ProcedureReturn SubString
EndIf
EndIf
ProcedureReturn ""
EndProcedure
Procedure IncludeRegEX()
CreateRegularExpression(0, "")
EndProcedure
DataSection
RegEx:
Data.i @RegEx_DecRef()
Data.i @RegEx_IncRef()
Data.i @RegEx_Match()
RegExMatch:
Data.i @RegExMatch_DecRef()
Data.i @RegExMatch_IncRef()
Data.i @RegExMatch_Next()
Data.i @RegExMatch_CountSubstrings()
Data.i @RegExMatch_GetSubString()
Data.i @RegExMatch_GetSubStringWithName()
EndDataSection