python - How can I edit my parser to properly group "AND" and "OR" predicates? -


i trying write small parser able parse simple key = value queries. should smart enough handle and , or groups, and having higher precendence. example text-input:

a = 10 && b = 20 = 10 || b = 20 = 10 && b = 20 || c = 30 

the first 2 trivial. last should group first 2 predicates "and" group, , group should grouped in "or" group.

i have basics down, got stuck on proper grouping. using ply uses flex/bison/lex/yacc syntax define grammar. if i'm totally heading down wrong track existing syntax please let me know... valuable learning experience concerning parsers.

i've tried setting precedence, don't think it's caused reduce/reduce conflict. think it's more of issue of way i've defined grammar in general, can't figure out need change.

below current implementation , unit-test file. test-file should understanding expected output. there's 1 failing test. that's 1 causes me headaches.

the tests can run using builtin unittest module, but, execute print statements in tests, suggest using pytest intercepts , causes less of mess. example (assuming both files in same folder):

python -m venv env ./env/bin/pip install pytest ./env/bin/pytest test_query_string.py 

filename: queryparser.py

import logging collections import namedtuple  import ply.lex lex import ply.yacc yacc   log = logging.getlogger(__name__)  predicate = namedtuple('predicate', 'key operator value')   class production:      def __repr__(self):         preds = [repr(pred) pred in self._predicates]         return '%s(%s)' % (self.__class__.__name__, ', '.join(preds))      def __eq__(self, other):         return (             self.__class__ == other.__class__ ,             self._predicates == other._predicates)      def debug(self, indent=0, aslist=false):         lines = []         lines.append('    ' * indent + self.__class__.__name__)         predicate in self._predicates:             if hasattr(predicate, 'debug'):                 lines.extend(predicate.debug(indent + 1, aslist=true))             else:                 lines.append('    ' * (indent+1) + repr(predicate))         if aslist:             return lines         else:             return '\n'.join(lines)    class conjunction(production):      def __init__(self, *predicates):         self._predicates = predicates   class disjunction(production):      def __init__(self, *predicates):         self._predicates = predicates   def parse(query: str, debug=false) -> predicate:     lexer = querylexer().build()     parser = queryparser().build()     if debug:         output = parser.parse(query, debug=log)     else:         output = parser.parse(query)     return output or []   class querylexer:      tokens = (         'word',         'operator',         'quote',         'and',         'or'     )      t_ignore = ' \t'     t_quote = '"'      def t_error(self, t):         log.warning('illegal character %r', t.value[0])         t.lexer.skip(1)      def t_word(self, t):         r'\w+'         return t      def t_operator(self, t):         r'(=|!=|>|<|<=|>=)'         return t      def t_and(self, t):         r'&&'         return t      def t_or(self, t):         r'\|\|'         return t      def build(self, **kwargs):         self.lexer = lex.lex(module=self, **kwargs)   class queryparser:      precedence = (         ('nonassoc', 'or'),         ('nonassoc', 'and'),     )      def p_query(self, p):         '''         query : disjunction               | conjunction               | predicate         '''         p[0] = p[1]      def p_disjunction(self, p):         '''         disjunction : predicate or predicate                     | predicate or conjunction                     | predicate or disjunction         '''         output = [p[1]]         if p.slice[3].type == 'disjunction':             # can merge multiple chanined disjunctions             output.extend(p[3]._predicates)         else:             output.append(p[3])         p[0] = disjunction(*output)      def p_conjunction(self, p):         '''         conjunction : predicate , predicate                     | predicate , conjunction                     | predicate , disjunction         '''         if len(p) == 4:             output = [p[1]]             if p.slice[3].type == 'conjunction':                 # can merge multiple chanined disjunctions                 output.extend(p[3]._predicates)             else:                 output.append(p[3])             p[0] = conjunction(*output)         else:             p[0] = conjunction(p[1])      def p_predicate(self, p):         '''         predicate : maybequoted operator maybequoted         '''         p[0] = predicate(p[1], p[2], p[3])      def p_maybequoted(self, p):         '''         maybequoted : word                     | quote word quote         '''         if len(p) == 4:             p[0] = p[2]         else:             p[0] = p[1]      def p_error(self, p):         """         panic-mode rule parser errors.         """         if not p:             log.debug('syntax error @ eof')         else:             self.parser.errok()         log.error('syntax error @ %r', p)      def build(self):         self.tokens = querylexer.tokens         self.parser = yacc.yacc(module=self, outputdir='/tmp', debug=true)         return self.parser 

filename: test_query_string.py

import unittest  queryparser import parse, conjunction, disjunction, predicate   class testquerystring(unittest.testcase):      def test_single_equals(self):         result = parse('hostname = foo')         self.assertequal(result, predicate('hostname', '=', 'foo'))      def test_single_equals_quoted(self):         result = parse('hostname = "foo"')         self.assertequal(result, predicate('hostname', '=', 'foo'))      def test_anded_equals(self):         result = parse('hostname = foo && role=cpe')         self.assertequal(result, conjunction(             predicate('hostname', '=', 'foo'),             predicate('role', '=', 'cpe'),         ))      def test_ored_equals(self):         result = parse('hostname = foo || role=cpe')         self.assertequal(result, disjunction(             predicate('hostname', '=', 'foo'),             predicate('role', '=', 'cpe'),         ))      def test_chained_conjunction(self):         result = parse('hostname = foo && role=cpe && bla=blub')         print(result.debug())  # xxx debug statement         self.assertequal(result, conjunction(             predicate('hostname', '=', 'foo'),             predicate('role', '=', 'cpe'),             predicate('bla', '=', 'blub'),         ))      def test_chained_disjunction(self):         result = parse('hostname = foo || role=cpe || bla=blub')         print(result.debug())  # xxx debug statement         self.assertequal(result, disjunction(             predicate('hostname', '=', 'foo'),             predicate('role', '=', 'cpe'),             predicate('bla', '=', 'blub'),         ))      def test_mixed_predicates(self):         result = parse('hostname = foo || role=cpe && bla=blub')         print(result.debug())  # xxx debug statement         self.assertequal(result, disjunction(             predicate('hostname', '=', 'foo'),             conjunction(                 predicate('role', '=', 'cpe'),                 predicate('bla', '=', 'blub'),             )         ))      def test_mixed_predicate_and_first(self):         result = parse('hostname = foo && role=cpe || bla=blub')         print(result.debug())  # xxx debug statement         self.assertequal(result, conjunction(             predicate('hostname', '=', 'foo'),             disjunction(                 predicate('role', '=', 'cpe'),                 predicate('bla', '=', 'blub'),             )         ))      def test_complex(self):         result = parse(             'a=1 && b=2 || c=3 && d=4 || e=5 || f=6 && g=7 && h=8',             debug=true         )         print(result.debug())  # xxx debug statement          expected = disjunction(             conjunction(                 predicate('a', '=', '1'),                 predicate('b', '=', '2'),             ),             conjunction(                 predicate('c', '=', '3'),                 predicate('d', '=', '4'),             ),             predicate('e', '=', '5'),             conjunction(                 predicate('f', '=', '6'),                 predicate('g', '=', '7'),                 predicate('h', '=', '8'),             ),         )          self.assertequal(result, expected) 

if using precedence declarations, both and , or should declared left, not nonassoc. nonassoc means a or b or c illegal; left means interpreted (a or b) or c) , right means a or (b or c). (given semantics of and , or, makes no difference whether left or right chosen, left preferable in such cases.)

with precedence relationships, possible write extremely simple grammar:

query: predicate      | query , query      | query or query 

(usually, there entry parenthesized expressions.)

the above not chaining looking for. post-parse walking tree, preference. possible chain on fly, using grammar explicit precedence.

explicit precedence means grammar defines possible; in particular, since and binds more tightly or, not possible have conjunction: predicate , disjunction precisely because production implies second operand and disjunction, not desired outcome. case, want common cascading sequence:

query       : disjunction  # redundant, possibly useful didactic purposes disjunction : conjunction             | disjunction or conjunction   # left associative conjunction : predicate             | conjunction , predicate 

with grammar, chaining straight-forward, requires explicit test in actions (eg., if p.slice(1).type == 'conjunction:) arguably bit ugly.

ideally, want trigger correct action directly grammar, imply (which similar grammar):

conjunction: predicate                 # p[0] = p[1]            | predicate , predicate                 # p[0] = conjunction(p[1], p[3])            | conjunction , predicate                 # p[0] = conjunction(*(p[1]._predicates + [p[3]]) 

the problem above rules second , third both apply a , b, since after reducing a predicate have both option reduce conjunction or shift and immediately. in case, want parser resolve shift-reduce conflict unconditionally shifting, do, after producing warning. explicit solution, need conjunction in third rule real conjunction, @ least 1 and operator.

with in mind, can shift unit productions top of cascade, resulting in following:

query      : disjunction            | conjunction            | predicate disjunction: predicate or predicate            | conjunction or predicate            | disjunction or predicate conjunction: predicate , predicate            | conjunction , predicate 

now have no need conditionals in actions, because know have in every case.

def p_query(self, p):     '''     query : disjunction           | conjunction           | predicate     '''     p[0] = p[1]  def p_disjunction1(self, p):     '''     disjunction: predicate or predicate                | conjunction or predicate     '''     p[0] = disjunction(p[1], p[3])  def p_disjunction2(self, p):     '''     disjunction: disjunction or predicate     '''     p[0] = disjunction(*(p[1]._predicate + [p[3]])  def p_conjunction1(self, p):     '''     conjunction: predicate , predicate     '''     p[0] = conjunction(p[1], p[3])  def p_conjunction2(self, p):     '''     conjunction: conjunction , predicate     '''     p[0] = disjunction(*(p[1]._predicate + [p[3]]) 

notes

  1. the grammar provided fine case of 2 precedence levels, number of productions ends being quadratic in number of levels. if annoying, alternative model more unit productions:

    query         : disjunction disjunction   : conjunction               | disjunction_2 disjunction_2 : conjunction or predicate               | disjunction_2 or predicate conjunction   : predicate               | conjunction_2 conjunction_2 : predicate , predicate               | conjunction_2 , predicate 
  2. if don't insist on parser objects being immutable, combine both of chaining functions (p_conjunction2 , p_disjunction2) single function:

    def p_chain(self, p):     '''     conjunction: conjunction , predicate     disjunction: disjunction or predicate     '''     p[0] = p[1]     p[0]._predicate.append(p[3]) 

    additional simplification achieved making value of operator tokens and , or constructor instead of matched string. (the matched string redundant, anyway.) allow constructor functions (p_disjunction1 , p_conjunction1 replaced single function:

    def t_and(self, t):     r'&&'     t.value = conjunction     return t  def t_or(self, t):     r'\|\|'     t.value = disjunction     return t  # ...  def p_construct(self, p):     '''     disjunction: predicate or predicate                | conjunction or predicate     conjunction: predicate , predicate     '''     p[0] = p[2](p[1], p[3]) 

Comments

Popular posts from this blog

html - How to set bootstrap input responsive width? -

javascript - Highchart x and y axes data from json -

javascript - Get js console.log as python variable in QWebView pyqt -