Page 1 of 1

Google-type string testing

Posted: Sun Apr 10, 2011 2:18 am
by Seymour Clufley
This is some code I've been using for a while. It works, even for complex search parameters (although I'm sure somebody will find a bug!).

It allows you to test whether strings match a given search pattern, in the manner of a Google search. In the example provided, the search pattern is
"innocent movie-watcher" -popcorn-
which is quite complex.

Some necessary macros for processing strings are included, which people may find useful for other projects.

Code: Select all

; Frequently used single-character strings, named after their ASCII codes.
Global c10.s = #LF$      ; line feed (ASCII 10)
Global c13.s = #CR$      ; carriage return (ASCII 13)
Global c32.s = " "       ; space (ASCII 32)
Global c34.s = Chr(34)   ; double quote
Global c39.s = "'"       ; single quote / apostrophe (ASCII 39)


; Converts a byte flag into human-readable text.
;   b : flag to describe; any non-zero value counts as true.
; Returns "Yes" when b is non-zero, otherwise "No".
Procedure.s ByteYesNo(b.b)
  
  Protected answer.s = "No"
  
  If b
      answer = "Yes"
  EndIf
  
  ProcedureReturn answer
  
EndProcedure


; Replaces every occurrence of a with b in the string variable t, repeating
; the replacement until t no longer contains a (e.g. collapsing runs of
; spaces into a single space).
;   t    : string VARIABLE that is modified in place
;   a    : text to search for
;   b    : replacement text
;   mode : passed straight to ReplaceString(); the default 1 equals
;          #PB_String_NoCase
; If b itself contains a, repeating the replacement could never finish,
; because every pass creates new matches. In that case a single pass is made.
; Note: a, b and t are expanded textually and therefore evaluated several times.
Macro RecursiveReplaceThisString(t,a,b,mode=1)
	If FindString(b,a,1)
		t = ReplaceString(t,a,b,mode)
	Else
		While FindString(t,a,1)
			t = ReplaceString(t,a,b,mode)
		Wend
	EndIf
EndMacro

; Function form of the repeated replace: returns t with a replaced by b
; until no occurrence of a is left.
;   t    : text to process (the caller's string is not modified)
;   a    : text to search for
;   b    : replacement text
;   mode : passed straight to ReplaceString(); the default 0 equals
;          #PB_String_CaseSensitive
; Returns the processed string.
Procedure.s RecursiveReplaceString(t.s,a.s,b.s,mode.b=0)
	
	If FindString(b,a,1)
		; The replacement contains the search text, so repeating would create
		; new matches forever. One pass is all that can sensibly be done.
		t = ReplaceString(t,a,b,mode)
	Else
		While FindString(t,a,1)
			t = ReplaceString(t,a,b,mode)
		Wend
	EndIf
	
	ProcedureReturn t
	
EndProcedure


; Makes sure the string variable t begins with start, prepending it when
; it is missing. An empty start leaves t untouched.
Macro EnsureThisStart(t,start)
	If start<>"" And Left(t,Len(start)) <> start
		t = start + t
	EndIf
EndMacro

; Makes sure the string variable t finishes with endd, appending it when
; it is missing. An empty endd leaves t untouched.
Macro EnsureThisEnd(t,endd)
	If endd<>"" And Right(t,Len(endd)) <> endd
		t = t + endd
	EndIf
EndMacro

; Strips one leading occurrence of start from the string variable t.
; t is left unchanged when it does not begin with start.
Macro EnsureThisNotStart(t,start)
	If Left(t,Len(start)) = start
		t = Right(t,Len(t)-Len(start))
	EndIf
EndMacro


; Debug helper: shows the text t in a message box titled "Report".
Macro R(t)
	MessageRequester("Report", t, #PB_MessageRequester_Ok)
EndMacro





; --------------------------------------------------------------------------------------------------


; Normalises text so that words can be matched as whole words.
; Line feeds are removed, the separator characters listed below are turned
; into spaces, the text is wrapped in one leading and one trailing space, and
; runs of spaces are collapsed to a single space.
;   str : text to normalise
; Returns the normalised text, e.g. "Hi, there!" becomes " Hi there ".
Procedure.s BreakIntoWords(str.s)
	
	; Every character in this string is treated as a word separator.
	Protected separators.s = c13 + "!?" + c34 + ".,|()"
	Protected pos
	
	str = RemoveString(str,c10)
	
	For pos = 1 To Len(separators)
		str = ReplaceString(str,Mid(separators,pos,1),c32)
	Next pos
	
	; Pad both ends so that the first and last word are delimited by spaces too.
	str = c32+str+c32
	RecursiveReplaceThisString(str,c32+c32,c32)
	
	ProcedureReturn str
	
EndProcedure


; Parsed form of a search pattern, filled in by ParseGoogleSearchString()
; and consumed by GoogleTestString().
Structure GoogleStringMatcher
	List want.s()      ; words/phrases the tested text must contain
	List dontwant.s()  ; words/phrases the tested text must not contain
	viable.b           ; #True once parsing produced at least one term
EndStructure


; Shows a message box summarising a parsed search pattern: the original
; search string, whether the matcher is usable, and the required and
; excluded terms with their counts.
;   *g      : matcher to describe
;   origstr : the search string the matcher was parsed from (display only)
Procedure ReportGoogleSearchParameters(*g.GoogleStringMatcher,origstr.s)
	
	Protected report.s
	Protected indent.s = "    "
	
	report = "SEARCH STRING..." + c13 + origstr + c13 + c13
	report + "Is valid for matching: " + UCase(ByteYesNo(*g\viable)) + c13 + c13
	
	report + "Wants: " + Str(ListSize(*g\want())) + c13
	ForEach *g\want()
		report + indent + *g\want() + c13
	Next
	
	report + c13 + "Don't wants: " + Str(ListSize(*g\dontwant())) + c13
	ForEach *g\dontwant()
		report + indent + *g\dontwant() + c13
	Next
	
	R(report)
	
EndProcedure


; Tests whether a text satisfies a parsed search pattern.
;   *g             : matcher filled in by ParseGoogleSearchString()
;   string         : text to test
;   casemode       : #PB_String_NoCase for a case-insensitive test; any other
;                    value compares case-sensitively
;   wholewordsonly : #True  - text and terms are normalised with
;                             BreakIntoWords(), so terms only match whole words
;                    #False - plain substring search on the trimmed text
;   reportstring   : #True shows the normalised text in a message box (debug aid)
; Returns #True when every wanted term occurs in the text and no unwanted
; term does. Returns #False otherwise, and also when *g is null or not viable.
Procedure.b GoogleTestString(*g.GoogleStringMatcher,string.s,casemode.b,wholewordsonly.b=#True,reportstring.b=#False)
	
	If *g = 0
		ProcedureReturn #False
	EndIf
	If Not *g\viable
		ProcedureReturn #False
	EndIf
	
	; Turn the simple HTML breaks into spaces BEFORE whitespace is normalised.
	; Doing it afterwards would reintroduce runs of spaces and make phrases
	; that span such a tag impossible to match.
	string = ReplaceString(string,"<p>",c32,#PB_String_NoCase)
	string = ReplaceString(string,"<br>",c32,#PB_String_NoCase)
	
	If casemode=#PB_String_NoCase
		string = LCase(string)
	EndIf
	If wholewordsonly
		string = BreakIntoWords(string)
	Else
		string = Trim(string)
	EndIf
	
	; Work on private copies so the matcher itself stays untouched and can be
	; reused with different options.
	NewList want.s()
	CopyList(*g\want(),want())
	NewList dontwant.s()
	CopyList(*g\dontwant(),dontwant())
	
	ForEach want()
		If casemode=#PB_String_NoCase
			want() = LCase(want())
		EndIf
		If wholewordsonly
			want() = BreakIntoWords(want())
		Else
			want() = Trim(want())
		EndIf
	Next
	
	; Each copied element is normalised in place. It must not be re-read from
	; *g\dontwant(), because that refers to the matcher's CURRENT element and
	; would overwrite every entry with the same term.
	ForEach dontwant()
		If casemode=#PB_String_NoCase
			dontwant() = LCase(dontwant())
		EndIf
		If wholewordsonly
			dontwant() = BreakIntoWords(dontwant())
		Else
			dontwant() = Trim(dontwant())
		EndIf
	Next
	
	If reportstring : R(string) : EndIf
	
	; Now search the text. Blank terms carry no information and are ignored.
	; Otherwise an empty "want" could never be found in substring mode, and an
	; empty "don't want" (a single space) would reject every text in
	; whole-word mode.
	ForEach want()
		If Trim(want()) <> ""
			If FindString(string,want(),1) = 0
				ProcedureReturn #False
			EndIf
		EndIf
	Next
	ForEach dontwant()
		If Trim(dontwant()) <> ""
			If FindString(string,dontwant(),1)
				ProcedureReturn #False
			EndIf
		EndIf
	Next
	
	ProcedureReturn #True
	
EndProcedure


; States of the character scanner in ParseGoogleSearchString().
Enumeration
	#GoogleStringParse_ModePositive        ; outside any marker: characters form a wanted word
	#GoogleStringParse_ModePositivePhrase  ; inside a quoted phrase that is wanted
	#GoogleStringParse_ModeNegative        ; inside a single unwanted word
	#GoogleStringParse_ModeNegativePhrase  ; inside a quoted phrase that is unwanted
EndEnumeration

; Parses a search pattern into the lists of a GoogleStringMatcher.
;
; Supported syntax (single quotes are treated like double quotes):
;   word              the text must contain this word
;   "some phrase"     the text must contain this phrase
;   -word-            the text must not contain this word
;   -"some phrase"-   the text must not contain this phrase
; Example: nice "nice person" -bad- -"bad person"-
;
;   string : the search pattern
;   *g     : matcher that receives the result; its lists are replaced
; Returns #True (and sets *g\viable) when at least one term was found,
; otherwise #False.
;
; The characters "¬", "`" and "|" are used internally as markers, so they
; cannot be part of a search term.
Procedure.b ParseGoogleSearchString(string.s,*g.GoogleStringMatcher)
	
	Protected wants$, dontwants$, letter.s, item.s
	Protected mode, a, items
	
	; NOTE(review): *g is (re)initialised unconditionally, as before. If the
	; caller passes a structure that is already initialised, the previous lists
	; are presumably not released - confirm against the InitializeStructure docs.
	InitializeStructure(*g,GoogleStringMatcher)
	
	; Start from "not viable" so a reused matcher can never keep a stale #True
	; when the new pattern turns out to contain no terms.
	*g\viable = #False
	
	If string = ""
		ProcedureReturn #False
	EndIf
	
	string = ReplaceString(string,c39,c34)
	
	; Surround the pattern with spaces so that the first and last term are
	; delimited the same way as all the others.
	EnsureThisEnd(string,c32)
	EnsureThisStart(string,c32)
	
	; Negative phrase delimiters (-" and "-) become the single marker "¬".
	string = ReplaceString(string,"-"+c34,"¬")
	string = ReplaceString(string,c34+"-","¬")
	
	string = ReplaceString(string,c32+"-",c32+"`") ; single negative words (opener)
	string = ReplaceString(string,"-"+c32,"`"+c32) ; single negative words (closer)
	
	RecursiveReplaceThisString(string,c32+c32,c32)
	EnsureThisNotStart(string,c32)
	
	; Scan character by character. Terms are collected into two "|"-separated
	; strings, one for wanted and one for unwanted terms.
	mode=#GoogleStringParse_ModePositive
	For a = 1 To Len(string)
		letter = Mid(string,a,1)
		Select letter
			Case "¬"
				Select mode ; negative phrases
					Case #GoogleStringParse_ModeNegativePhrase
						mode=#GoogleStringParse_ModePositive
						dontwants$+"|"
					Case #GoogleStringParse_ModePositive
						mode=#GoogleStringParse_ModeNegativePhrase
				EndSelect
			Case "`" ; single negative words
				Select mode
					Case #GoogleStringParse_ModeNegative
						mode=#GoogleStringParse_ModePositive
						dontwants$+"|"
					Case #GoogleStringParse_ModePositive
						mode=#GoogleStringParse_ModeNegative
				EndSelect
			Case Chr(34) ; positive phrases
				Select mode
					Case #GoogleStringParse_ModePositivePhrase
						mode=#GoogleStringParse_ModePositive
						wants$+"|"
					Case #GoogleStringParse_ModePositive
						mode=#GoogleStringParse_ModePositivePhrase
				EndSelect
			Case Chr(32)
				; A space ends a single word but is part of a phrase.
				Select mode
					Case #GoogleStringParse_ModePositive
						wants$+"|"
					Case #GoogleStringParse_ModeNegative
						dontwants$+"|"
					Case #GoogleStringParse_ModeNegativePhrase
						dontwants$+c32
					Case #GoogleStringParse_ModePositivePhrase
						wants$+c32
				EndSelect
			Default ; ordinary character: append to the term being built
				If mode=#GoogleStringParse_ModeNegative Or mode=#GoogleStringParse_ModeNegativePhrase
					dontwants$+letter
				Else
					wants$+letter
				EndIf
		EndSelect
	Next
	
	; Split the collected strings into list elements. All fields are visited
	; (separator count + 1) and blank ones are skipped. This drops the empty
	; fields produced by leading or doubled "|" (e.g. when the pattern starts
	; with a negative term) and keeps a final phrase whose closing quote is
	; missing.
	ClearList(*g\want())
	items = CountString(wants$,"|") + 1
	For a = 1 To items
		item = StringField(wants$,a,"|")
		If Trim(item) <> ""
			AddElement(*g\want())
			*g\want() = item
		EndIf
	Next a
	
	ClearList(*g\dontwant())
	items = CountString(dontwants$,"|") + 1
	For a = 1 To items
		item = StringField(dontwants$,a,"|")
		If Trim(item) <> ""
			AddElement(*g\dontwant())
			*g\dontwant() = item
		EndIf
	Next a
	
	If ListSize(*g\want()) Or ListSize(*g\dontwant())
		*g\viable = #True
	EndIf
	
	ProcedureReturn *g\viable
	
EndProcedure




; --- Demo: parse a pattern, show what was understood, then test three texts ---
Define g.GoogleStringMatcher
Define quotedPhrase.s = c34+"innocent movie-watcher"+c34
Define params.s = quotedPhrase+" -popcorn-"

ParseGoogleSearchString(params,@g)
ReportGoogleSearchParameters(@g,params)

; Case-sensitive (#PB_String_CaseSensitive = 0), whole words only.
Debug GoogleTestString(@g,"He was watching movies.",#PB_String_CaseSensitive,#True)
Debug GoogleTestString(@g,"He was an "+quotedPhrase+".",#PB_String_CaseSensitive,#True)
Debug GoogleTestString(@g,"He was an "+quotedPhrase+", according to reports, and he liked popcorn.",#PB_String_CaseSensitive,#True)
Hope someone can use it!

Re: Google-type string testing

Posted: Sun Apr 10, 2011 3:17 am
by idle
Thanks, that will come in handy.

Re: Google-type string testing

Posted: Mon Apr 11, 2011 8:33 am
by akj
Where can I find a comprehensive summary of permitted search patterns?

Re: Google-type string testing

Posted: Mon Apr 11, 2011 9:53 am
by Trond
Why do you have a minus after popcorn? That's not how anyone else does it. Google just uses a minus in front.

Re: Google-type string testing

Posted: Mon Apr 11, 2011 5:18 pm
by Seymour Clufley
Trond wrote:Why do you have a minus after popcorn? That's not how anyone else does it. Google just uses a minus in front.
I didn't know that. I'll change the code.
akj wrote:Where can I find a comprehensive summary of permitted search patterns?
I don't know. The code caters for words and phrases that the text MUST contain, and words and phrases that the text must NOT contain.

Re: Google-type string testing

Posted: Mon Apr 11, 2011 6:23 pm
by DarkDragon
Seymour Clufley wrote:
akj wrote:Where can I find a comprehensive summary of permitted search patterns?
I don't know. The code caters for words and phrases that the text MUST contain, and words and phrases that the text must NOT contain.
I had such a list of search patterns for common search engines (and a list of which search engine supports which commands). They also accept words like AND, OR, NEAR or NOT as commands!

http://www.monash.com/spidap4.html