Conversion requete de recherche vers PostgreSQL "to_tsquery"

Partagez votre expérience de PureBasic avec les autres utilisateurs.
Avatar de l’utilisateur
flaith
Messages : 1487
Inscription : jeu. 07/avr./2005 1:06
Localisation : Rennes
Contact :

Conversion requete de recherche vers PostgreSQL "to_tsquery"

Message par flaith »

Hello :)

PostgreSQL permet de faire des recherches précises dans un texte en utilisant "to_tsquery", dans mon cas sur des CV, donc ayant besoin, pour ce projet actuel, de faire des recherches dans une base de données pour un website, j'ai été inspiré par ce code (Converting a google search query to PostgreSQL tsquery), Il manquait la partie générant les lexers, je l'ai donc rajouté dans mon code PHP, et pour les besoins d'administration de la base de données, j'ai fait un programme en Purebasic, que voici :

Code : Tout sélectionner

; Program       : PROCEDURES_ParseQuery.pbi
; Comment       : Generates PostgreSql to_tsquery
; Author        : Flaith
; Create        : 24.11.2013
; Update        : 27.11.2013
; Inspired of the PHP code from Peter Bailey
; http://stackoverflow.com/questions/207817/converting-a-google-search-query-to-a-postgresql-tsquery#tab-top

;-Modules declaration
DeclareModule HandleString
  ; sPrintf
  ; =======
  ; From mk-soft
  ; http://www.purebasic.fr/english/viewtopic.php?p=241016
  Declare.s sPrintf(text.s, *value1 = 0, *value2 = 0, *value3 = 0, *value4 = 0, *value5 = 0, *value6 = 0, *value7 = 0, *value8 = 0, *value9 = 0, *value10 = 0, *value11 = 0 )

  ; L/R/TrimChars
  ; =============
  ; From Little John
  ; http://www.purebasic.fr/english/viewtopic.php?p=411500#p411500
  #WHITESPACE$ = " " + #TAB$ + #CRLF$
  Declare.s LTrimChars (source$, charlist$=#WHITESPACE$)
  Declare.s RTrimChars (source$, charlist$=#WHITESPACE$)
  Declare.s TrimChars  (source$, charlist$=#WHITESPACE$)
EndDeclareModule

DeclareModule QueryPhrase
  ; Constants and structures are declare as public
  #MODE_DEFAULT  = 1
  #MODE_OR       = 2
  #MODE_AND      = 3
  #MODE_EXCLUDE  = 4

  #PHRASE        = 0
  #SUBPHRASE     = 1

  Structure CS_QUERY_PHRASE
    Phrase.s
    Mode.i
    Type.i                    ; Phrase or subExpression
  EndStructure

  Structure CS_QUERY_SUBEXPRESSION
    List Phrases.CS_QUERY_PHRASE()
  EndStructure

  Global NewList Phrases.CS_QUERY_PHRASE()
  Global NewList SubExpressions.CS_QUERY_SUBEXPRESSION()
  Global TYPE.i = #PHRASE

  Declare initiateSubExpression()
EndDeclareModule

DeclareModule QueryExpression
  Declare AddQueryPhrase(__phrase.s, __type.i = QueryPhrase::#PHRASE, _Mode.i = QueryPhrase::#MODE_DEFAULT)
  Declare AddPhrase(__input.s, __type.i)
  Declare AddOrPhrase(__input.s, __type.i)
  Declare AddAndPhrase(__input.s, __type.i)
  Declare AddExclusionPhrase(__input.s, __type.i)
EndDeclareModule

DeclareModule QueryLexer
  ; Global declare as public
  Global.s NewList TokenStack()
  Global.s NewList ErrorStack()

  Declare   ErrToken(__ErrNum.s)
  Declare   AddToken(__token.s)
  Declare.i Strtok(List __Token.s(), __str.s, __sep.s = " ")
  Declare.s Execute(__str.s)
EndDeclareModule

DeclareModule QueryParser
  Declare.s Parse(__input.s)
  Declare.s ParseQuery(__input.s)
  Declare   ProcessToken(__token.s)
EndDeclareModule

DeclareModule QueryBuilder
  Declare.s GetResult()
  Declare.s FormatString(__Expression.s, __Mode.i)
  Declare   ProcessExpression()
EndDeclareModule

;-Modules creation
Module HandleString
  Procedure.s sPrintf( text.s, *value1 = 0, *value2 = 0, *value3 = 0, *value4 = 0, *value5 = 0, *value6 = 0, *value7 = 0, *value8 = 0, *value9 = 0, *value10 = 0, *value11 = 0 )
    Protected args, *value
    Protected result.s, help.s
    Protected *text.character
    Protected IsValue, IsString, IsLeft, IsVZ, IsNum2, SetFill.s, num1, num2
    
    args  = @*value1
    *text = @text
    
    Repeat
      Select *text\c
        Case 0
          Break
        Case '\'
          *text + SizeOf ( character )
          Select *text\c
            Case 0   : Break
            Case '\' : result + "\"
            Case 'n' : result + #LF$
            Case 'r' : result + #CR$
            Case 't' : result + #HT$
            Case 'v' : result + #VT$
            Case 39  : result + #DQUOTE$ ; (')
            Case 'a' : result + #BEL$
            Case 'b' : result + #BS$
            Case 'f' : result + #FF$
            Case '[' : result + #ESC$
            Case '0' : result + #NULL$
          EndSelect
          *text + SizeOf ( character )

        Case '%'
          help     = "?"
          IsValue = #False
          IsString = #False
          IsLeft = #False
          IsVZ   = #False
          IsNum2 = #False
          SetFill = " "
          num1    = 0
          num2    = 0
          *text   + SizeOf ( character )
          *value  = PeekI  ( args      ) ; get pointer to value
          
          Repeat
            Select *text\c
              Case 0   : Break
              Case '-' : IsLeft = #True
              Case '+' : IsVZ   = #True
              Case '.' : IsNum2 = #True
              Case '%' : result + "%" : *text + SizeOf ( character ) : Break
              Case 39  : *text + SizeOf ( character ) : If *text\c = 0 : Break : Else : SetFill = Chr(*text\c) : EndIf
              Case '0' To '9'
                If IsNum2 : num2 = num2 * 10 + *text\c - 48 : Else : num1 = num1 * 10 + *text\c - 48 : EndIf
              Case 'b'
                If *value : help = Str ( PeekB ( *value ) ) : EndIf : IsValue = #True
              Case 'w'
                If *value : help = Str ( PeekW ( *value ) ) : EndIf : IsValue = #True
              Case 'l'
                If *value : help = Str ( PeekL ( *value ) ) : EndIf : IsValue = #True
              Case 'q'
                If *value : help = Str ( PeekQ ( *value ) ) : EndIf : IsValue = #True
              Case 'i'
                If *value : help = Str ( PeekI ( *value ) ) : EndIf : IsValue = #True
              Case 'f'
                If *value : help = StrF  ( PeekF ( *value ), num2 ) : EndIf : IsValue = #True
              Case 'd'
                If *value : help = StrD  ( PeekD ( *value ), num2 ) : EndIf : IsValue = #True
              Case 's'
                If *value : help = PeekS ( *value     ) : EndIf
                If num2   : help = Left  ( help, num2 ) : EndIf : IsString = #True
              Case 'c'
                If *value : help = Chr ( PeekI ( *value ) ) : EndIf : IsString = #True
              Case 'X', 'x'
                If num2 = 0 : num2 = num1 : EndIf
                If *value
                  Select num2
                    Case 0 To 2  : help = RSet ( Hex ( PeekB ( *value ), #PB_Byte), num2, "0" )
                    Case 3 To 4  : help = RSet ( Hex ( PeekW ( *value ), #PB_Word), num2, "0" )
                    Case 5 To 8  : help = RSet ( Hex ( PeekL ( *value ), #PB_Long), num2, "0" )
                    Default      : help = RSet ( Hex ( PeekQ ( *value ), #PB_Quad), num2, "0" )
                  EndSelect
                EndIf
                If *text\c = 'x' : help = LCase ( help ) : EndIf
                IsString = #True
              Default
                IsString = #True
            EndSelect
            
            If IsValue And IsVZ
              If Left ( help, 1 ) <> "-"
                help = "+" + help
              EndIf
            EndIf
            
            *text + SizeOf(character)
            
            If IsString Or IsValue
              If num1 And Len ( help ) < num1
                If IsLeft
                  result + LSet ( help, num1, SetFill )
                Else
                  result + RSet ( help, num1, SetFill )
                EndIf
              Else
                result + help
              EndIf
              args + SizeOf ( Integer )
              Break
            EndIf
          ForEver
          
        Default
          result + Chr    ( *text\c   )
          *text  + SizeOf ( character )
      EndSelect
    ForEver
    
    ProcedureReturn result
  EndProcedure
  
  Procedure.s LTrimChars (source$, charlist$=#WHITESPACE$)
    ; removes from source$ all leading characters which are contained in charlist$
    Protected p, last=Len(source$)
    
    p = 1
    While p <= last And FindString(charlist$, Mid(source$,p,1)) <> 0
      p + 1
    Wend
    
    ProcedureReturn Mid(source$, p)
  EndProcedure
  
  Procedure.s RTrimChars (source$, charlist$=#WHITESPACE$)
    ; removes from source$ all trailing characters which are contained in charlist$
    Protected p
    
    p = Len(source$)
    While p >= 1 And FindString(charlist$, Mid(source$,p,1)) <> 0
      p - 1
    Wend
    
    ProcedureReturn Left(source$, p)
  EndProcedure
  
  Procedure.s TrimChars(source$, charlist$=#WHITESPACE$)
    ProcedureReturn LtrimChars(RtrimChars(source$, charlist$), charlist$)
  EndProcedure
EndModule

Module QueryPhrase
  Procedure initiateSubExpression()
    AddElement(SubExpressions())
  EndProcedure
EndModule

Module QueryExpression
  Procedure AddQueryPhrase(__phrase.s, __type.i = QueryPhrase::#PHRASE, __Mode.i = QueryPhrase::#MODE_DEFAULT)
    Select __type
      Case QueryPhrase::#PHRASE
        AddElement(QueryPhrase::Phrases())
          QueryPhrase::Phrases()\Phrase = __phrase
          QueryPhrase::Phrases()\Mode   = __Mode
          QueryPhrase::Phrases()\Type   = __type
      Case QueryPhrase::#SUBPHRASE
        AddElement(QueryPhrase::SubExpressions()\Phrases())
          QueryPhrase::SubExpressions()\Phrases()\Phrase = __phrase
          QueryPhrase::SubExpressions()\Phrases()\Mode   = __Mode
          QueryPhrase::SubExpressions()\Phrases()\Type   = __type
    EndSelect
  EndProcedure

  Procedure AddPhrase(__input.s, __type.i)
    AddQueryPhrase(__input, __type)
  EndProcedure
  
  Procedure AddOrPhrase(__input.s, __type.i)
    AddQueryPhrase(__input, __type, QueryPhrase::#MODE_OR)
  EndProcedure
  
  Procedure AddAndPhrase(__input.s, __type.i)
    AddQueryPhrase(__input, __type, QueryPhrase::#MODE_AND)
  EndProcedure
  
  Procedure AddExclusionPhrase(__input.s, __type.i)
    AddQueryPhrase(__input, __type, QueryPhrase::#MODE_EXCLUDE)
  EndProcedure
EndModule

Module QueryLexer
  Procedure ErrToken(__ErrNum.s)
    AddElement(ErrorStack())
    ErrorStack() = __ErrNum
  EndProcedure
  
  Procedure AddToken(__token.s)
    AddElement(TokenStack())
    TokenStack() = __token
  EndProcedure
  
  Procedure.i Strtok(List __Token.s(), __str.s, __sep.s = " ")
    Protected.i _strCount = CountString(__str, __sep)
    Protected.i _index
    
    For _index = 1 To _strCount + 1
      AddElement(__Token())
        __Token() = StringField(__str, _index, __sep)
    Next
    ProcedureReturn _strCount + 1
  EndProcedure
  
  Procedure.s Execute(__str.s)
    Protected.i _TokenCount, _index, _posQuote
    Protected.i _DQUOTE, _PARENTHESE, _POS_QUOTE_START
    Protected.s _OneCar = ""
    Protected.s _searchString = ""
    Protected.s _LogicalToken, _keyToken, _KeepToken, _KeepTokenTmp, _Token, _TokenTmp, _FullToken
    Protected.s NewList _token()
    
    ; split each token in a Array
    _searchString = LCase(__str)
    _TokenCount   = Strtok(_token(), _searchString, " ")

    If _TokenCount > 0
      ; Get each token
      _DQUOTE           = 0
      _PARENTHESE       = 0
      ForEach _token()
        _TokenTmp         = ""
        _FullToken        = ""
        _index            = 1
        _POS_QUOTE_START  = 0       ; Case where we are on another token and quotes are already open
                                    ; so when we extract the whole token With _POS_QUOTE_START + 1,
                                    ; we are sure to start at 0 (cf line Case '"' when _DQUOTE > 0)
        
        ; Check for Or, And, Not, |, &, -
        _keyToken = _token()
        
        Select _keyToken
          Case "or"
            _LogicalToken = "or "
            _keyToken = ""
            
          Case "and"
            _LogicalToken = "and "
            _keyToken = ""
            
          Case "not"
            _LogicalToken = "not "
            _keyToken = ""
            
          Case "&"
            _LogicalToken = "and "
            _keyToken = ""
            
          Case "|"
            _LogicalToken = "or "
            _keyToken = ""
            
          Case "-"
            _LogicalToken = "not "
            _keyToken = ""
        EndSelect
        
        While _index <= Len(_keyToken)
          _OneCar = Mid(_keyToken, _index, 1)
          Select _OneCar
            Case "("
              AddToken("(")
              _TokenTmp = ""
              _PARENTHESE + 1
            Case ")"
              If _TokenTmp <> ""
                If _LogicalToken <> ""
                  AddToken(_LogicalToken + Trim(_TokenTmp))
                  _LogicalToken = ""
                Else 
                  AddToken(Trim(_TokenTmp))
                EndIf
              EndIf
              _TokenTmp = ""
              AddToken(")")
              _PARENTHESE - 1
            Case "'"
              _index + 1
            Case #DQUOTE$
              If _DQUOTE > 0              ; Means the checked value has already an open quote, so we can add the token
                _Token          = Mid(_keyToken, _POS_QUOTE_START + 1, _index - _POS_QUOTE_START - 1)
                If _KeepToken <> ""
                  _FullToken = Trim(_KeepToken) + "$[0]" + Trim(_token)      ; or " #0 "
                  If _LogicalToken <> ""
                    _FullToken = _LogicalToken + _FullToken
                  EndIf
                  AddToken(Trim(_FullToken))
                  _FullToken    = ""
                  _LogicalToken = ""
                  _KeepToken = ""
                Else
                  If _LogicalToken <> ""
                    AddToken(_LogicalToken + Trim(_token))
                    _LogicalToken = ""
                  Else
                    AddToken(Trim(_token))     ; We add the only one token we found between double quotes
                  EndIf
                EndIf
                _DQUOTE - 1
              Else
                ; Case it's a new quote, we must keep the token to be add with the one we will find
                ; when the quote will be closed
                ; check If we have another quote after this one
                _POS_QUOTE_START = _index
                _DQUOTE + 1
                ; If no other Quote, we keep it to be added after
                If FindString(_keyToken, #DQUOTE$, _index + 1) = 0
                  ; No so we keep it
                  _KeepToken + Mid(_keyToken, _index + 1)
                EndIf
              EndIf
              _TokenTmp = ""
            Default
              _TokenTmp + _OneCar
          EndSelect
          _index+1
        Wend
        
        ; We are in the case of a new token appears and the previous had a quote
        If _DQUOTE > 0
          _posQuote = FindString(_keyToken, #DQUOTE$, 1)   ; Check to be sure there is no other quote
          If _posQuote = 0                                 ; No so we keep it
            _KeepTokenTmp = _TokenTmp
            _KeepToken + "$[0]" + _KeepTokenTmp            ; or " #0 "
          EndIf
        Else
          If _TokenTmp <> ""
            If _LogicalToken <> ""
              AddToken(_LogicalToken + Trim(_TokenTmp))
              _LogicalToken = ""
            Else
              _TokenTmp = Trim(_TokenTmp)
              ; We start With the first token And there is no quote, nor parenthese, just a word
              If ListSize(tokenStack()) = 0
                ; add parentheses To be sure To have the 'and' between each word
                If (Left(_TokenTmp, 1) = "-")
                  AddToken("not " + Mid(_TokenTmp, 2))
                Else
                  AddToken("(")
                  AddToken(_TokenTmp)
                  AddToken(")")
                EndIf
              Else ; Not empty
                If Left(_TokenTmp, 1) = "-"        ; If left of token = '-' don't add the 'And'
                  AddToken("not " + Mid(_TokenTmp, 2))
                  ;addToken(_TokenTmp)
                Else
                  AddToken("and " + _TokenTmp)    ;  automatic 'and' between all
                EndIf
              EndIf
            EndIf
          EndIf
        EndIf
      Next    ; End ForEach
      
      If _DQUOTE     <> 0 : ErrToken("#ERR001") : EndIf    ; ("Error double quotes does not match");
      If _PARENTHESE <> 0 : ErrToken("#ERR002") : EndIf    ; ("Error parentheses does not match");
    Else
      ProcedureReturn ""
    EndIf
  EndProcedure
EndModule

Module QueryParser
  Procedure.s Parse(__input.s)
    Protected.s _expression
    
    QueryLexer::Execute(__input)

    ForEach QueryLexer::TokenStack()
      ProcessToken(QueryLexer::TokenStack())
    Next
  EndProcedure
  
  Procedure.s ParseQuery(__input.s)
    ; Clear each time
    ClearList(QueryLexer::TokenStack())
    ClearList(QueryLexer::ErrorStack())
    ClearList(QueryPhrase::Phrases())
    ClearList(QueryPhrase::SubExpressions())

    QueryParser::Parse(__input)
    QueryBuilder::ProcessExpression()

    ProcedureReturn QueryBuilder::GetResult()
  EndProcedure
  
  Procedure ProcessToken(__token.s)
    Protected.s _Modifier, _Phrase, _Operator
    Protected.i _type = QueryPhrase::TYPE

    Select __token
      Case "("
        QueryPhrase::TYPE = QueryPhrase::#SUBPHRASE
        QueryPhrase::initiateSubExpression()
      Case ")"
        QueryPhrase::TYPE = QueryPhrase::#PHRASE
      Default
        _Modifier = Left(__token, 1)
        _Phrase   = Mid(__token, 2)
        Select _Modifier
          Case "-"
            QueryExpression::AddExclusionPhrase(_Phrase, _type)
          Case "+"
            QueryExpression::AddPhrase(_Phrase, _type)
          Default
            _Operator = Trim(Mid(__token, 1, FindString(__token, " ")))
            _Phrase   = Trim(Mid(__token, FindString(__token, " ")))
            Select LCase(_Operator)
              Case "and"
                QueryExpression::AddAndPhrase(_Phrase, _type)
              Case "or"
                QueryExpression::AddOrPhrase(_Phrase, _type)
              Case "not"
                QueryExpression::AddExclusionPhrase(_Phrase, _type)
              Default:
                QueryExpression::AddPhrase(__token, _type)
            EndSelect
        EndSelect
    EndSelect
    
  EndProcedure
EndModule

Module QueryBuilder
  ; Private var
  Global.s _Query
  
  Procedure.s GetResult()
    ProcedureReturn _Query
  EndProcedure

  Procedure.s FormatString(__Expression.s, __Mode.i)
    Protected.s _Expression = HandleString::TrimChars(__Expression, " &|")
    Protected.s _Format     = "''%s'' "
    
    Select __Mode
      Case QueryPhrase::#MODE_AND
        _Format = "& ''%s'' "
      Case QueryPhrase::#MODE_OR
        _Format = "| ''%s'' "
      Case QueryPhrase::#MODE_EXCLUDE
        _Format = "& !''%s'' "
    EndSelect
    _Expression = ReplaceString(_Expression, "'", "\\'")
    ProcedureReturn HandleString::sPrintf(_Format, @_Expression)
  EndProcedure

  Procedure ProcessExpression()
    Protected.s _Expression, _Format, _QueryTMP

    _Query = ""

    ForEach QueryPhrase::Phrases()
      If QueryPhrase::Phrases()\type = QueryPhrase::#PHRASE
        _Query + FormatString(QueryPhrase::Phrases()\Phrase, QueryPhrase::Phrases()\Mode)
      EndIf
    Next
    
    ForEach QueryPhrase::SubExpressions()
      _QueryTMP = ""
      ForEach QueryPhrase::SubExpressions()\Phrases()
        _QueryTMP + FormatString(QueryPhrase::SubExpressions()\Phrases()\Phrase, QueryPhrase::SubExpressions()\Phrases()\Mode)
      Next
      _Query + "& (" + _QueryTMP + ") "
    Next

    _Query = HandleString::TrimChars(_Query, " &|")
  EndProcedure
EndModule

;-End modules creation
Programme test :

Code : Tout sélectionner

XIncludeFile "PROCEDURES_ParseQuery.pbi"

Procedure showQuery(__query.s)
  Debug "Result for Query : "+__query+ #CRLF$ + QueryParser::ParseQuery(__query)
  Debug "-------------------------------------------------"
EndProcedure

str1$ = "("+#DQUOTE$+"sales director"+#DQUOTE$+" Or "+#DQUOTE$+"sale"+#DQUOTE$+") "+#DQUOTE$+"team leader"+#DQUOTE$+" ("+#DQUOTE$+"ho chi minh"+#DQUOTE$+" And male) -executive -fmcg Not representative"
str2$ = "java "+#DQUOTE$+"technical architect"+#DQUOTE$
str3$ = #DQUOTE$+"qa manager"+#DQUOTE$+" -fmcg"
str4$ = #DQUOTE$+"qa manager"+#DQUOTE$+" Or -fmcg"
str5$ = "GENERAL MANAGER"
str6$ = "manager java developer"
str7$ = "-fmcg"
str8$ = "java -fmcg"
str9$ = "("+#DQUOTE$+"used cars"+#DQUOTE$+" Or "+#DQUOTE$+"new cars"+#DQUOTE$+") -ford -mistubishi"

; --- Result for str1$ must be :
; ''team #0 leader'' & !''executive'' & !''fmcg'' & !''representative'' & (''sales #0 director'' | ''sale'' ) & (''ho #0 chi #0 minh'' & ''male'' )
; --- (27/11/2013) result :
; ''team$[0]leader'' & !''executive'' & !''fmcg'' & !''representative'' & (''sales$[0]director'' | ''sale'' ) & (''ho$[0]chi$[0]minh'' & ''male'' )
showQuery(str1$)
showQuery(str2$)
showQuery(str3$)
showQuery(str4$)
showQuery(str5$)
showQuery(str6$)
showQuery(str7$)
showQuery(str8$)
showQuery(str9$)
J'ai le source complet en PHP car j'ai fait la version Purebasic en fonction de ce code PHP :wink: