Converting a search query to PostgreSQL to_tsquery

Share your advanced PureBasic knowledge/code with the community.
User avatar
flaith
Enthusiast
Enthusiast
Posts: 704
Joined: Mon Apr 25, 2005 9:28 pm
Location: $300:20 58 FC 60 - Rennes
Contact:

Converting a search query to PostgreSQL to_tsquery

Post by flaith »

Hi :)

inspired by this code (Converting a google search query to PostgreSQL tsquery), I've added the code that generate the Lexer :

Code: Select all

 ; Program       : PROCEDURES_ParseQuery.pbi
; Comment       : Generates PostgreSql to_tsquery
; Author        : Flaith
; Create        : 24.11.2013
; Update        : 27.11.2013
; Inspired of the PHP code from Peter Bailey
; http://stackoverflow.com/questions/207817/converting-a-google-search-query-to-a-postgresql-tsquery#tab-top

;-Modules declaration
DeclareModule HandleString
  ; sPrintf
  ; =======
  ; From mk-soft
  ; http://www.purebasic.fr/english/viewtopic.php?p=241016
  Declare.s sPrintf(text.s, *value1 = 0, *value2 = 0, *value3 = 0, *value4 = 0, *value5 = 0, *value6 = 0, *value7 = 0, *value8 = 0, *value9 = 0, *value10 = 0, *value11 = 0 )

  ; L/R/TrimChars
  ; =============
  ; From Little John
  ; http://www.purebasic.fr/english/viewtopic.php?p=411500#p411500
  #WHITESPACE$ = " " + #TAB$ + #CRLF$
  Declare.s LTrimChars (source$, charlist$=#WHITESPACE$)
  Declare.s RTrimChars (source$, charlist$=#WHITESPACE$)
  Declare.s TrimChars  (source$, charlist$=#WHITESPACE$)
EndDeclareModule

DeclareModule QueryPhrase
  ; Constants and structures are declare as public
  #MODE_DEFAULT  = 1
  #MODE_OR       = 2
  #MODE_AND      = 3
  #MODE_EXCLUDE  = 4

  #PHRASE        = 0
  #SUBPHRASE     = 1

  Structure CS_QUERY_PHRASE
    Phrase.s
    Mode.i
    Type.i                    ; Phrase or subExpression
  EndStructure

  Structure CS_QUERY_SUBEXPRESSION
    List Phrases.CS_QUERY_PHRASE()
  EndStructure

  Global NewList Phrases.CS_QUERY_PHRASE()
  Global NewList SubExpressions.CS_QUERY_SUBEXPRESSION()
  Global TYPE.i = #PHRASE

  Declare initiateSubExpression()
EndDeclareModule

DeclareModule QueryExpression
  Declare AddQueryPhrase(__phrase.s, __type.i = QueryPhrase::#PHRASE, _Mode.i = QueryPhrase::#MODE_DEFAULT)
  Declare AddPhrase(__input.s, __type.i)
  Declare AddOrPhrase(__input.s, __type.i)
  Declare AddAndPhrase(__input.s, __type.i)
  Declare AddExclusionPhrase(__input.s, __type.i)
EndDeclareModule

DeclareModule QueryLexer
  ; Global declare as public
  Global.s NewList TokenStack()
  Global.s NewList ErrorStack()

  Declare   ErrToken(__ErrNum.s)
  Declare   AddToken(__token.s)
  Declare.i Strtok(List __Token.s(), __str.s, __sep.s = " ")
  Declare.s Execute(__str.s)
EndDeclareModule

DeclareModule QueryParser
  Declare.s Parse(__input.s)
  Declare.s ParseQuery(__input.s)
  Declare   ProcessToken(__token.s)
EndDeclareModule

DeclareModule QueryBuilder
  Declare.s GetResult()
  Declare.s FormatString(__Expression.s, __Mode.i)
  Declare   ProcessExpression()
EndDeclareModule

;-Modules creation
Module HandleString
  Procedure.s sPrintf( text.s, *value1 = 0, *value2 = 0, *value3 = 0, *value4 = 0, *value5 = 0, *value6 = 0, *value7 = 0, *value8 = 0, *value9 = 0, *value10 = 0, *value11 = 0 )
    Protected args, *value
    Protected result.s, help.s
    Protected *text.character
    Protected IsValue, IsString, IsLeft, IsVZ, IsNum2, SetFill.s, num1, num2
   
    args  = @*value1
    *text = @text
   
    Repeat
      Select *text\c
        Case 0
          Break
        Case '\'
          *text + SizeOf ( character )
          Select *text\c
            Case 0   : Break
            Case '\' : result + "\"
            Case 'n' : result + #LF$
            Case 'r' : result + #CR$
            Case 't' : result + #HT$
            Case 'v' : result + #VT$
            Case 39  : result + #DQUOTE$ ; (')
            Case 'a' : result + #BEL$
            Case 'b' : result + #BS$
            Case 'f' : result + #FF$
            Case '[' : result + #ESC$
            Case '0' : result + #NULL$
          EndSelect
          *text + SizeOf ( character )

        Case '%'
          help     = "?"
          IsValue = #False
          IsString = #False
          IsLeft = #False
          IsVZ   = #False
          IsNum2 = #False
          SetFill = " "
          num1    = 0
          num2    = 0
          *text   + SizeOf ( character )
          *value  = PeekI  ( args      ) ; get pointer to value
         
          Repeat
            Select *text\c
              Case 0   : Break
              Case '-' : IsLeft = #True
              Case '+' : IsVZ   = #True
              Case '.' : IsNum2 = #True
              Case '%' : result + "%" : *text + SizeOf ( character ) : Break
              Case 39  : *text + SizeOf ( character ) : If *text\c = 0 : Break : Else : SetFill = Chr(*text\c) : EndIf
              Case '0' To '9'
                If IsNum2 : num2 = num2 * 10 + *text\c - 48 : Else : num1 = num1 * 10 + *text\c - 48 : EndIf
              Case 'b'
                If *value : help = Str ( PeekB ( *value ) ) : EndIf : IsValue = #True
              Case 'w'
                If *value : help = Str ( PeekW ( *value ) ) : EndIf : IsValue = #True
              Case 'l'
                If *value : help = Str ( PeekL ( *value ) ) : EndIf : IsValue = #True
              Case 'q'
                If *value : help = Str ( PeekQ ( *value ) ) : EndIf : IsValue = #True
              Case 'i'
                If *value : help = Str ( PeekI ( *value ) ) : EndIf : IsValue = #True
              Case 'f'
                If *value : help = StrF  ( PeekF ( *value ), num2 ) : EndIf : IsValue = #True
              Case 'd'
                If *value : help = StrD  ( PeekD ( *value ), num2 ) : EndIf : IsValue = #True
              Case 's'
                If *value : help = PeekS ( *value     ) : EndIf
                If num2   : help = Left  ( help, num2 ) : EndIf : IsString = #True
              Case 'c'
                If *value : help = Chr ( PeekI ( *value ) ) : EndIf : IsString = #True
              Case 'X', 'x'
                If num2 = 0 : num2 = num1 : EndIf
                If *value
                  Select num2
                    Case 0 To 2  : help = RSet ( Hex ( PeekB ( *value ), #PB_Byte), num2, "0" )
                    Case 3 To 4  : help = RSet ( Hex ( PeekW ( *value ), #PB_Word), num2, "0" )
                    Case 5 To 8  : help = RSet ( Hex ( PeekL ( *value ), #PB_Long), num2, "0" )
                    Default      : help = RSet ( Hex ( PeekQ ( *value ), #PB_Quad), num2, "0" )
                  EndSelect
                EndIf
                If *text\c = 'x' : help = LCase ( help ) : EndIf
                IsString = #True
              Default
                IsString = #True
            EndSelect
           
            If IsValue And IsVZ
              If Left ( help, 1 ) <> "-"
                help = "+" + help
              EndIf
            EndIf
           
            *text + SizeOf(character)
           
            If IsString Or IsValue
              If num1 And Len ( help ) < num1
                If IsLeft
                  result + LSet ( help, num1, SetFill )
                Else
                  result + RSet ( help, num1, SetFill )
                EndIf
              Else
                result + help
              EndIf
              args + SizeOf ( Integer )
              Break
            EndIf
          ForEver
         
        Default
          result + Chr    ( *text\c   )
          *text  + SizeOf ( character )
      EndSelect
    ForEver
   
    ProcedureReturn result
  EndProcedure
 
  Procedure.s LTrimChars (source$, charlist$=#WHITESPACE$)
    ; removes from source$ all leading characters which are contained in charlist$
    Protected p, last=Len(source$)
   
    p = 1
    While p <= last And FindString(charlist$, Mid(source$,p,1)) <> 0
      p + 1
    Wend
   
    ProcedureReturn Mid(source$, p)
  EndProcedure
 
  Procedure.s RTrimChars (source$, charlist$=#WHITESPACE$)
    ; removes from source$ all trailing characters which are contained in charlist$
    Protected p
   
    p = Len(source$)
    While p >= 1 And FindString(charlist$, Mid(source$,p,1)) <> 0
      p - 1
    Wend
   
    ProcedureReturn Left(source$, p)
  EndProcedure
 
  Procedure.s TrimChars(source$, charlist$=#WHITESPACE$)
    ProcedureReturn LtrimChars(RtrimChars(source$, charlist$), charlist$)
  EndProcedure
EndModule

Module QueryPhrase
  Procedure initiateSubExpression()
    AddElement(SubExpressions())
  EndProcedure
EndModule

Module QueryExpression
  Procedure AddQueryPhrase(__phrase.s, __type.i = QueryPhrase::#PHRASE, __Mode.i = QueryPhrase::#MODE_DEFAULT)
    Select __type
      Case QueryPhrase::#PHRASE
        AddElement(QueryPhrase::Phrases())
          QueryPhrase::Phrases()\Phrase = __phrase
          QueryPhrase::Phrases()\Mode   = __Mode
          QueryPhrase::Phrases()\Type   = __type
      Case QueryPhrase::#SUBPHRASE
        AddElement(QueryPhrase::SubExpressions()\Phrases())
          QueryPhrase::SubExpressions()\Phrases()\Phrase = __phrase
          QueryPhrase::SubExpressions()\Phrases()\Mode   = __Mode
          QueryPhrase::SubExpressions()\Phrases()\Type   = __type
    EndSelect
  EndProcedure

  Procedure AddPhrase(__input.s, __type.i)
    AddQueryPhrase(__input, __type)
  EndProcedure
 
  Procedure AddOrPhrase(__input.s, __type.i)
    AddQueryPhrase(__input, __type, QueryPhrase::#MODE_OR)
  EndProcedure
 
  Procedure AddAndPhrase(__input.s, __type.i)
    AddQueryPhrase(__input, __type, QueryPhrase::#MODE_AND)
  EndProcedure
 
  Procedure AddExclusionPhrase(__input.s, __type.i)
    AddQueryPhrase(__input, __type, QueryPhrase::#MODE_EXCLUDE)
  EndProcedure
EndModule

Module QueryLexer
  Procedure ErrToken(__ErrNum.s)
    AddElement(ErrorStack())
    ErrorStack() = __ErrNum
  EndProcedure
 
  Procedure AddToken(__token.s)
    AddElement(TokenStack())
    TokenStack() = __token
  EndProcedure
 
  Procedure.i Strtok(List __Token.s(), __str.s, __sep.s = " ")
    Protected.i _strCount = CountString(__str, __sep)
    Protected.i _index
   
    For _index = 1 To _strCount + 1
      AddElement(__Token())
        __Token() = StringField(__str, _index, __sep)
    Next
    ProcedureReturn _strCount + 1
  EndProcedure
 
  Procedure.s Execute(__str.s)
    Protected.i _TokenCount, _index, _posQuote
    Protected.i _DQUOTE, _PARENTHESE, _POS_QUOTE_START
    Protected.s _OneCar = ""
    Protected.s _searchString = ""
    Protected.s _LogicalToken, _keyToken, _KeepToken, _KeepTokenTmp, _Token, _TokenTmp, _FullToken
    Protected.s NewList _token()
   
    ; split each token in a Array
    _searchString = LCase(__str)
    _TokenCount   = Strtok(_token(), _searchString, " ")

    If _TokenCount > 0
      ; Get each token
      _DQUOTE           = 0
      _PARENTHESE       = 0
      ForEach _token()
        _TokenTmp         = ""
        _FullToken        = ""
        _index            = 1
        _POS_QUOTE_START  = 0       ; Case where we are on another token and quotes are already open
                                    ; so when we extract the whole token With _POS_QUOTE_START + 1,
                                    ; we are sure to start at 0 (cf line Case '"' when _DQUOTE > 0)
       
        ; Check for Or, And, Not, |, &, -
        _keyToken = _token()
       
        Select _keyToken
          Case "or"
            _LogicalToken = "or "
            _keyToken = ""
           
          Case "and"
            _LogicalToken = "and "
            _keyToken = ""
           
          Case "not"
            _LogicalToken = "not "
            _keyToken = ""
           
          Case "&"
            _LogicalToken = "and "
            _keyToken = ""
           
          Case "|"
            _LogicalToken = "or "
            _keyToken = ""
           
          Case "-"
            _LogicalToken = "not "
            _keyToken = ""
        EndSelect
       
        While _index <= Len(_keyToken)
          _OneCar = Mid(_keyToken, _index, 1)
          Select _OneCar
            Case "("
              AddToken("(")
              _TokenTmp = ""
              _PARENTHESE + 1
            Case ")"
              If _TokenTmp <> ""
                If _LogicalToken <> ""
                  AddToken(_LogicalToken + Trim(_TokenTmp))
                  _LogicalToken = ""
                Else
                  AddToken(Trim(_TokenTmp))
                EndIf
              EndIf
              _TokenTmp = ""
              AddToken(")")
              _PARENTHESE - 1
            Case "'"
              _index + 1
            Case #DQUOTE$
              If _DQUOTE > 0              ; Means the checked value has already an open quote, so we can add the token
                _Token          = Mid(_keyToken, _POS_QUOTE_START + 1, _index - _POS_QUOTE_START - 1)
                If _KeepToken <> ""
                  _FullToken = Trim(_KeepToken) + "$[0]" + Trim(_token)      ; or " #0 "
                  If _LogicalToken <> ""
                    _FullToken = _LogicalToken + _FullToken
                  EndIf
                  AddToken(Trim(_FullToken))
                  _FullToken    = ""
                  _LogicalToken = ""
                  _KeepToken = ""
                Else
                  If _LogicalToken <> ""
                    AddToken(_LogicalToken + Trim(_token))
                    _LogicalToken = ""
                  Else
                    AddToken(Trim(_token))     ; We add the only one token we found between double quotes
                  EndIf
                EndIf
                _DQUOTE - 1
              Else
                ; Case it's a new quote, we must keep the token to be add with the one we will find
                ; when the quote will be closed
                ; check If we have another quote after this one
                _POS_QUOTE_START = _index
                _DQUOTE + 1
                ; If no other Quote, we keep it to be added after
                If FindString(_keyToken, #DQUOTE$, _index + 1) = 0
                  ; No so we keep it
                  _KeepToken + Mid(_keyToken, _index + 1)
                EndIf
              EndIf
              _TokenTmp = ""
            Default
              _TokenTmp + _OneCar
          EndSelect
          _index+1
        Wend
       
        ; We are in the case of a new token appears and the previous had a quote
        If _DQUOTE > 0
          _posQuote = FindString(_keyToken, #DQUOTE$, 1)   ; Check to be sure there is no other quote
          If _posQuote = 0                                 ; No so we keep it
            _KeepTokenTmp = _TokenTmp
            _KeepToken + "$[0]" + _KeepTokenTmp            ; or " #0 "
          EndIf
        Else
          If _TokenTmp <> ""
            If _LogicalToken <> ""
              AddToken(_LogicalToken + Trim(_TokenTmp))
              _LogicalToken = ""
            Else
              _TokenTmp = Trim(_TokenTmp)
              ; We start With the first token And there is no quote, nor parenthese, just a word
              If ListSize(tokenStack()) = 0
                ; add parentheses To be sure To have the 'and' between each word
                If (Left(_TokenTmp, 1) = "-")
                  AddToken("not " + Mid(_TokenTmp, 2))
                Else
                  AddToken("(")
                  AddToken(_TokenTmp)
                  AddToken(")")
                EndIf
              Else ; Not empty
                If Left(_TokenTmp, 1) = "-"        ; If left of token = '-' don't add the 'And'
                  AddToken("not " + Mid(_TokenTmp, 2))
                  ;addToken(_TokenTmp)
                Else
                  AddToken("and " + _TokenTmp)    ;  automatic 'and' between all
                EndIf
              EndIf
            EndIf
          EndIf
        EndIf
      Next    ; End ForEach
     
      If _DQUOTE     <> 0 : ErrToken("#ERR001") : EndIf    ; ("Error double quotes does not match");
      If _PARENTHESE <> 0 : ErrToken("#ERR002") : EndIf    ; ("Error parentheses does not match");
    Else
      ProcedureReturn ""
    EndIf
  EndProcedure
EndModule

Module QueryParser
  Procedure.s Parse(__input.s)
    Protected.s _expression
   
    QueryLexer::Execute(__input)

    ForEach QueryLexer::TokenStack()
      ProcessToken(QueryLexer::TokenStack())
    Next
  EndProcedure
 
  Procedure.s ParseQuery(__input.s)
    ; Clear each time
    ClearList(QueryLexer::TokenStack())
    ClearList(QueryLexer::ErrorStack())
    ClearList(QueryPhrase::Phrases())
    ClearList(QueryPhrase::SubExpressions())

    QueryParser::Parse(__input)
    QueryBuilder::ProcessExpression()

    ProcedureReturn QueryBuilder::GetResult()
  EndProcedure
 
  Procedure ProcessToken(__token.s)
    Protected.s _Modifier, _Phrase, _Operator
    Protected.i _type = QueryPhrase::TYPE

    Select __token
      Case "("
        QueryPhrase::TYPE = QueryPhrase::#SUBPHRASE
        QueryPhrase::initiateSubExpression()
      Case ")"
        QueryPhrase::TYPE = QueryPhrase::#PHRASE
      Default
        _Modifier = Left(__token, 1)
        _Phrase   = Mid(__token, 2)
        Select _Modifier
          Case "-"
            QueryExpression::AddExclusionPhrase(_Phrase, _type)
          Case "+"
            QueryExpression::AddPhrase(_Phrase, _type)
          Default
            _Operator = Trim(Mid(__token, 1, FindString(__token, " ")))
            _Phrase   = Trim(Mid(__token, FindString(__token, " ")))
            Select LCase(_Operator)
              Case "and"
                QueryExpression::AddAndPhrase(_Phrase, _type)
              Case "or"
                QueryExpression::AddOrPhrase(_Phrase, _type)
              Case "not"
                QueryExpression::AddExclusionPhrase(_Phrase, _type)
              Default:
                QueryExpression::AddPhrase(__token, _type)
            EndSelect
        EndSelect
    EndSelect
   
  EndProcedure
EndModule

Module QueryBuilder
  ; Private var
  Global.s _Query
 
  Procedure.s GetResult()
    ProcedureReturn _Query
  EndProcedure

  Procedure.s FormatString(__Expression.s, __Mode.i)
    Protected.s _Expression = HandleString::TrimChars(__Expression, " &|")
    Protected.s _Format     = "''%s'' "
   
    Select __Mode
      Case QueryPhrase::#MODE_AND
        _Format = "& ''%s'' "
      Case QueryPhrase::#MODE_OR
        _Format = "| ''%s'' "
      Case QueryPhrase::#MODE_EXCLUDE
        _Format = "& !''%s'' "
    EndSelect
    _Expression = ReplaceString(_Expression, "'", "\\'")
    ProcedureReturn HandleString::sPrintf(_Format, @_Expression)
  EndProcedure

  Procedure ProcessExpression()
    Protected.s _Expression, _Format, _QueryTMP

    _Query = ""

    ForEach QueryPhrase::Phrases()
      If QueryPhrase::Phrases()\type = QueryPhrase::#PHRASE
        _Query + FormatString(QueryPhrase::Phrases()\Phrase, QueryPhrase::Phrases()\Mode)
      EndIf
    Next
   
    ForEach QueryPhrase::SubExpressions()
      _QueryTMP = ""
      ForEach QueryPhrase::SubExpressions()\Phrases()
        _QueryTMP + FormatString(QueryPhrase::SubExpressions()\Phrases()\Phrase, QueryPhrase::SubExpressions()\Phrases()\Mode)
      Next
      _Query + "& (" + _QueryTMP + ") "
    Next

    _Query = HandleString::TrimChars(_Query, " &|")
  EndProcedure
EndModule

;-End modules creation
Test :

Code: Select all

XIncludeFile "PROCEDURES_ParseQuery.pbi"

Procedure showQuery(__query.s)
  Debug "Result for Query : "+__query+ #CRLF$ + QueryParser::ParseQuery(__query)
  Debug "-------------------------------------------------"
EndProcedure

str1$ = "("+#DQUOTE$+"sales director"+#DQUOTE$+" Or "+#DQUOTE$+"sale"+#DQUOTE$+") "+#DQUOTE$+"team leader"+#DQUOTE$+" ("+#DQUOTE$+"ho chi minh"+#DQUOTE$+" And male) -executive -fmcg Not representative"
str2$ = "java "+#DQUOTE$+"technical architect"+#DQUOTE$
str3$ = #DQUOTE$+"qa manager"+#DQUOTE$+" -fmcg"
str4$ = #DQUOTE$+"qa manager"+#DQUOTE$+" Or -fmcg"
str5$ = "GENERAL MANAGER"
str6$ = "manager java developer"
str7$ = "-fmcg"
str8$ = "java -fmcg"
str9$ = "("+#DQUOTE$+"used cars"+#DQUOTE$+" Or "+#DQUOTE$+"new cars"+#DQUOTE$+") -ford -mistubishi"

; --- Result for str1$ must be :
; ''team #0 leader'' & !''executive'' & !''fmcg'' & !''representative'' & (''sales #0 director'' | ''sale'' ) & (''ho #0 chi #0 minh'' & ''male'' )
; --- (27/11/2013) result :
; ''team$[0]leader'' & !''executive'' & !''fmcg'' & !''representative'' & (''sales$[0]director'' | ''sale'' ) & (''ho$[0]chi$[0]minh'' & ''male'' )
showQuery(str1$)
showQuery(str2$)
showQuery(str3$)
showQuery(str4$)
showQuery(str5$)
showQuery(str6$)
showQuery(str7$)
showQuery(str8$)
showQuery(str9$)
I have the complete PHP code also, I made the Purebasic version from the PHP :wink:
“Fear is a reaction. Courage is a decision.” - WC