python - How can I edit my parser to properly group "AND" and "OR" predicates? -
i trying write small parser able parse simple key = value
queries. should smart enough handle and
, or
groups, and
having higher precendence. example text-input:
a = 10 && b = 20 = 10 || b = 20 = 10 && b = 20 || c = 30
the first 2 trivial. last should group first 2 predicates "and" group, , group should grouped in "or" group.
i have basics down, got stuck on proper grouping. using ply uses flex/bison/lex/yacc syntax define grammar. if i'm totally heading down wrong track existing syntax please let me know... valuable learning experience concerning parsers.
i've tried setting precedence, don't think it's caused reduce/reduce conflict. think it's more of issue of way i've defined grammar in general, can't figure out need change.
below current implementation , unit-test file. test-file should understanding expected output. there's 1 failing test. that's 1 causes me headaches.
the tests can run using builtin unittest
module, but, execute print
statements in tests, suggest using pytest
intercepts , causes less of mess. example (assuming both files in same folder):
python -m venv env ./env/bin/pip install pytest ./env/bin/pytest test_query_string.py
filename: queryparser.py
import logging collections import namedtuple import ply.lex lex import ply.yacc yacc log = logging.getlogger(__name__) predicate = namedtuple('predicate', 'key operator value') class production: def __repr__(self): preds = [repr(pred) pred in self._predicates] return '%s(%s)' % (self.__class__.__name__, ', '.join(preds)) def __eq__(self, other): return ( self.__class__ == other.__class__ , self._predicates == other._predicates) def debug(self, indent=0, aslist=false): lines = [] lines.append(' ' * indent + self.__class__.__name__) predicate in self._predicates: if hasattr(predicate, 'debug'): lines.extend(predicate.debug(indent + 1, aslist=true)) else: lines.append(' ' * (indent+1) + repr(predicate)) if aslist: return lines else: return '\n'.join(lines) class conjunction(production): def __init__(self, *predicates): self._predicates = predicates class disjunction(production): def __init__(self, *predicates): self._predicates = predicates def parse(query: str, debug=false) -> predicate: lexer = querylexer().build() parser = queryparser().build() if debug: output = parser.parse(query, debug=log) else: output = parser.parse(query) return output or [] class querylexer: tokens = ( 'word', 'operator', 'quote', 'and', 'or' ) t_ignore = ' \t' t_quote = '"' def t_error(self, t): log.warning('illegal character %r', t.value[0]) t.lexer.skip(1) def t_word(self, t): r'\w+' return t def t_operator(self, t): r'(=|!=|>|<|<=|>=)' return t def t_and(self, t): r'&&' return t def t_or(self, t): r'\|\|' return t def build(self, **kwargs): self.lexer = lex.lex(module=self, **kwargs) class queryparser: precedence = ( ('nonassoc', 'or'), ('nonassoc', 'and'), ) def p_query(self, p): ''' query : disjunction | conjunction | predicate ''' p[0] = p[1] def p_disjunction(self, p): ''' disjunction : predicate or predicate | predicate or conjunction | predicate or disjunction ''' output = [p[1]] if p.slice[3].type == 'disjunction': # can merge multiple chanined disjunctions output.extend(p[3]._predicates) else: output.append(p[3]) p[0] = disjunction(*output) def p_conjunction(self, p): ''' conjunction : predicate , predicate | predicate , conjunction | predicate , disjunction ''' if len(p) == 4: output = [p[1]] if p.slice[3].type == 'conjunction': # can merge multiple chanined disjunctions output.extend(p[3]._predicates) else: output.append(p[3]) p[0] = conjunction(*output) else: p[0] = conjunction(p[1]) def p_predicate(self, p): ''' predicate : maybequoted operator maybequoted ''' p[0] = predicate(p[1], p[2], p[3]) def p_maybequoted(self, p): ''' maybequoted : word | quote word quote ''' if len(p) == 4: p[0] = p[2] else: p[0] = p[1] def p_error(self, p): """ panic-mode rule parser errors. """ if not p: log.debug('syntax error @ eof') else: self.parser.errok() log.error('syntax error @ %r', p) def build(self): self.tokens = querylexer.tokens self.parser = yacc.yacc(module=self, outputdir='/tmp', debug=true) return self.parser
filename: test_query_string.py
import unittest queryparser import parse, conjunction, disjunction, predicate class testquerystring(unittest.testcase): def test_single_equals(self): result = parse('hostname = foo') self.assertequal(result, predicate('hostname', '=', 'foo')) def test_single_equals_quoted(self): result = parse('hostname = "foo"') self.assertequal(result, predicate('hostname', '=', 'foo')) def test_anded_equals(self): result = parse('hostname = foo && role=cpe') self.assertequal(result, conjunction( predicate('hostname', '=', 'foo'), predicate('role', '=', 'cpe'), )) def test_ored_equals(self): result = parse('hostname = foo || role=cpe') self.assertequal(result, disjunction( predicate('hostname', '=', 'foo'), predicate('role', '=', 'cpe'), )) def test_chained_conjunction(self): result = parse('hostname = foo && role=cpe && bla=blub') print(result.debug()) # xxx debug statement self.assertequal(result, conjunction( predicate('hostname', '=', 'foo'), predicate('role', '=', 'cpe'), predicate('bla', '=', 'blub'), )) def test_chained_disjunction(self): result = parse('hostname = foo || role=cpe || bla=blub') print(result.debug()) # xxx debug statement self.assertequal(result, disjunction( predicate('hostname', '=', 'foo'), predicate('role', '=', 'cpe'), predicate('bla', '=', 'blub'), )) def test_mixed_predicates(self): result = parse('hostname = foo || role=cpe && bla=blub') print(result.debug()) # xxx debug statement self.assertequal(result, disjunction( predicate('hostname', '=', 'foo'), conjunction( predicate('role', '=', 'cpe'), predicate('bla', '=', 'blub'), ) )) def test_mixed_predicate_and_first(self): result = parse('hostname = foo && role=cpe || bla=blub') print(result.debug()) # xxx debug statement self.assertequal(result, conjunction( predicate('hostname', '=', 'foo'), disjunction( predicate('role', '=', 'cpe'), predicate('bla', '=', 'blub'), ) )) def test_complex(self): result = parse( 'a=1 && b=2 || c=3 && d=4 || e=5 || f=6 && g=7 && h=8', debug=true ) print(result.debug()) # xxx debug statement expected = disjunction( conjunction( predicate('a', '=', '1'), predicate('b', '=', '2'), ), conjunction( predicate('c', '=', '3'), predicate('d', '=', '4'), ), predicate('e', '=', '5'), conjunction( predicate('f', '=', '6'), predicate('g', '=', '7'), predicate('h', '=', '8'), ), ) self.assertequal(result, expected)
if using precedence declarations, both and
, or
should declared left
, not nonassoc
. nonassoc
means a or b or c
illegal; left
means interpreted (a or b) or c)
, right
means a or (b or c)
. (given semantics of and
, or
, makes no difference whether left
or right
chosen, left
preferable in such cases.)
with precedence relationships, possible write extremely simple grammar:
query: predicate | query , query | query or query
(usually, there entry parenthesized expressions.)
the above not chaining looking for. post-parse walking tree, preference. possible chain on fly, using grammar explicit precedence.
explicit precedence means grammar defines possible; in particular, since and
binds more tightly or
, not possible have conjunction: predicate , disjunction
precisely because production implies second operand and
disjunction, not desired outcome. case, want common cascading sequence:
query : disjunction # redundant, possibly useful didactic purposes disjunction : conjunction | disjunction or conjunction # left associative conjunction : predicate | conjunction , predicate
with grammar, chaining straight-forward, requires explicit test in actions (eg., if p.slice(1).type == 'conjunction:
) arguably bit ugly.
ideally, want trigger correct action directly grammar, imply (which similar grammar):
conjunction: predicate # p[0] = p[1] | predicate , predicate # p[0] = conjunction(p[1], p[3]) | conjunction , predicate # p[0] = conjunction(*(p[1]._predicates + [p[3]])
the problem above rules second , third both apply a , b
, since after reducing a
predicate
have both option reduce conjunction
or shift and
immediately. in case, want parser resolve shift-reduce conflict unconditionally shifting, do, after producing warning. explicit solution, need conjunction
in third rule real conjunction, @ least 1 and
operator.
with in mind, can shift unit productions top of cascade, resulting in following:
query : disjunction | conjunction | predicate disjunction: predicate or predicate | conjunction or predicate | disjunction or predicate conjunction: predicate , predicate | conjunction , predicate
now have no need conditionals in actions, because know have in every case.
def p_query(self, p): ''' query : disjunction | conjunction | predicate ''' p[0] = p[1] def p_disjunction1(self, p): ''' disjunction: predicate or predicate | conjunction or predicate ''' p[0] = disjunction(p[1], p[3]) def p_disjunction2(self, p): ''' disjunction: disjunction or predicate ''' p[0] = disjunction(*(p[1]._predicate + [p[3]]) def p_conjunction1(self, p): ''' conjunction: predicate , predicate ''' p[0] = conjunction(p[1], p[3]) def p_conjunction2(self, p): ''' conjunction: conjunction , predicate ''' p[0] = disjunction(*(p[1]._predicate + [p[3]])
notes
the grammar provided fine case of 2 precedence levels, number of productions ends being quadratic in number of levels. if annoying, alternative model more unit productions:
query : disjunction disjunction : conjunction | disjunction_2 disjunction_2 : conjunction or predicate | disjunction_2 or predicate conjunction : predicate | conjunction_2 conjunction_2 : predicate , predicate | conjunction_2 , predicate
if don't insist on parser objects being immutable, combine both of chaining functions (
p_conjunction2
,p_disjunction2
) single function:def p_chain(self, p): ''' conjunction: conjunction , predicate disjunction: disjunction or predicate ''' p[0] = p[1] p[0]._predicate.append(p[3])
additional simplification achieved making value of operator tokens
and
,or
constructor instead of matched string. (the matched string redundant, anyway.) allow constructor functions (p_disjunction1
,p_conjunction1
replaced single function:def t_and(self, t): r'&&' t.value = conjunction return t def t_or(self, t): r'\|\|' t.value = disjunction return t # ... def p_construct(self, p): ''' disjunction: predicate or predicate | conjunction or predicate conjunction: predicate , predicate ''' p[0] = p[2](p[1], p[3])
Comments
Post a Comment