# python/parserVgq.py
# knuth lr(k) parser
#---------- output error debug ----------
def out(*ee, sep = ', '):
    """ print the long (strL) representation of ee, elements joined by sep """
    text = strL(*ee, sep = sep)
    print(text)
def outS(*ee, sep = ', '):
    """ print the short (strS) representation of ee, elements joined by sep """
    text = strS(*ee, sep = sep)
    print(text)
def err(*e, sep = ', '):
    """ raise an Exception whose message is the strL representation of e

    the raise always leaves the function, so the old trailing exit() was
    unreachable dead code and has been removed
    """
    raise Exception('error: ' + strL(*e, sep = sep))
def errS(*e, sep = ', '):
    """ raise an Exception whose message is the strS representation of e

    the raise always leaves the function, so the old trailing exit() was
    unreachable dead code and has been removed
    """
    raise Exception('error: ' + strS(*e, sep = sep))
dbgLev = 10 # global debug verbosity threshold: messages with level <= dbgLev are printed
def dbg(l, *e, sep = ', '):
    """ print e in long (strL) form when debug level l is enabled """
    if l <= dbgLev:
        print('dbg: ' + strL(*e, sep = sep))
def dbgS(l, *e, sep = ', '):
    """ print e in short (strS) form when debug level l is enabled """
    if l <= dbgLev:
        print('dbg: ' + strS(*e, sep = sep))
import re
#---------- string representations ----------
""" use method strL for nice lont string representations """
from datetime import * ;
import re;
# horizontal separator line used by the test drivers' output
strSep1 = '--------------------------------'
def strL(*ee, sep = ', '):
    """ return a long stringrepresentation
    elements of ee in long (strL) representation, separated by sep
    deep dive into collections, represented by strS
    """
    def one(e):
        if type(e) == str:
            return e
        if hasattr(type(e), 'strL'):
            return e.strL()
        return strC(e)
    return sep.join(one(e) for e in ee)
def strS(*ee, sep = ', '):
    """ short string representation: strings verbatim, everything else via strC """
    rendered = [e if type(e) == str else strC(e) for e in ee]
    return sep.join(rendered)
def strC(e):
    """return a str representing e. recursively expand collections, otherwise use str"""
    t = type(e)
    if t == str:
        # plain \w+ words stay bare, anything else is quoted via repr
        return repr(e) if strCW.fullmatch(e) == None else e
    elif t == tuple:
        return '(' + ', '.join(strC(v) for v in e) + ')'
    elif isinstance(e, list): # t == list:
        return '[' + ', '.join(strC(v) for v in e) + ']'
    elif t == set or t == frozenset:
        # element representations are sorted for deterministic output
        return '{' + strSo(e, fmt=strC) + '}'
    elif isinstance(e, dict):
        return '{' + ', '.join(strC(k) + ': ' + strC(v) for k, v in e.items()) + '}'
    else:
        return str(e)
def strSo(ee, sep=', ', fmt=str):
    """ join the sorted fmt-representations of the elements of ee by sep """
    return sep.join(sorted(fmt(e) for e in ee))
# raw string: '\w' in a plain literal is an invalid escape sequence
# (DeprecationWarning today, SyntaxError in a future Python)
strCW = re.compile(r'\w+')
# raw string: '\s' in a plain literal is an invalid escape sequence
# (DeprecationWarning today, SyntaxError in a future Python)
strNSs = re.compile(r'\s')
def strNS(s):
    """ token representation: non-strings via strC,
    strings are quoted (repr) when empty or containing whitespace """
    return strC(s) if not isinstance(s, str) else s if s != '' and strNSs.search(s) == None else repr(s)
def strNSsq(s):
    """ blank separated strNS representations of the elements of sequence s """
    return ' '.join(strNS(e) for e in s)
def strNSset(s):
    """ blank separated strNS representations of the sorted elements of s """
    return strNSsq(sorted(s))
#---------- ds: dictionary to set ----------
def dsAdd(g, e, p):
    """ add p to the set stored at g[e], creating the set when absent
    return True if g has been changed
    """
    if e not in g:
        g[e] = {p}
        return True
    s = g[e]
    before = len(s)
    s.add(p)
    return len(s) > before
def dsUnion(g, e, s):
    """ union s into g[e]; an absent entry gets a copy of s
    return True if g has been changed
    """
    if not s:
        # nothing to merge; never creates an empty entry
        return False
    if e not in g:
        g[e] = s.copy()
        return True
    before = len(g[e])
    g[e] |= s
    return len(g[e]) > before
def dsUpdate(le, ri):
    """ union every ri[k] into le[k], creating entries as needed
    return True if le has been changed
    """
    changed = False
    for k, v in ri.items():
        assert len(v) > 0
        tgt = le.setdefault(k, set())
        before = len(tgt)
        tgt |= v
        if len(tgt) != before:
            changed = True
    return changed
#---------- dd: dictionary tree ----------
def ddPut(d, *key):
    """ add the key sequence to tree dict d: key[-1] is the value,
    key[:-1] is the path; '' is a special key for the value at an inner node
    """
    node = d
    for k in key[0:-2]:
        if k not in node:
            node[k] = {}
        elif not isinstance(node[k], dict):
            # a plain value sits here: push it down under the special key ''
            node[k] = {'': node[k]}
        node = node[k]
    last, val = key[-2:]
    if last in node and isinstance(node[last], dict):
        node[last][''] = val
    else:
        node[last] = val
    dbg(30, 'ddPut after key=', key, 'd=', d)
def ddGet(d, *key):
    """ return the value in tree dict d at the key sequence, or None if absent;
    an inner dict node yields its value under the special key '' """
    node = d
    for k in key:
        if not isinstance(node, dict) or k not in node:
            return None
        node = node[k]
    if isinstance(node, dict):
        return node.get('')
    return node
def ddRed(d, dflt = None):
    """ reduce tree d: if all nodes point to the same value replace it by the single value
    if dflt != None, remove all entries with value dflt
    """
    hasDi = False
    for k, v in d.items():
        if isinstance(v, dict):
            # reduce subtree first; it may collapse to a plain value
            v = ddRed(v, dflt)
            d[k] = v
        hasDi |= isinstance(v, dict)
    dbg(20, 'ddRed di', hasDi)
    if not hasDi:
        # leaf level: collapse when all values are equal
        # NOTE(review): set() requires all values to be hashable -- TODO confirm callers
        vv = set(d.values())
        dbg(20, 'ddRed vv', vv)
        if len(vv) == 1:
            return anyOf(vv)
    if dflt != None:
        assert '' not in d or dflt == d['']
        # drop all explicit entries equal to the default, keep it once under ''
        for k in [k for k,v in d.items() if v == dflt and k != '']:
            del d[k]
        d[''] = dflt
    return d
#---------- div helpers ----------
def anyOf(c):
    """ return an arbitrary Element of a collection or iterator """
    it = iter(c)
    for first in it:
        return first
    err('anyOf empty', c)
#---------- PObj: the root class ----------
class PObj:
    "root and container for all our objects"
    end = '!'     # the end-of-input marker token
    sqStr = True  # a sequence of nonterminals is representend as: a single string if sqStr is True else a tuple of strings
    sqEmpty = '' if sqStr else ()      # the empty sequence in the chosen representation
    sqEnd = end if sqStr else (end, )  # the end marker as a sequence
    lrK = 0       # current lookahead length k of the LR(k) tables
    def setName(self, nm):
        # name is the display string used by __str__ and strA
        self.name = nm
    def __init__(self, nm = ''):
        self.setName(nm)
    def __str__ (self):
        return self.name
    def strA(self, *aa):
        """ name{a=..., b=...} representation for the attributes named in aa;
        an element 'label=attr' shows attribute attr under the label """
        return str(self) + '{' + ', '.join(a + '=' + strC(getattr(self, a)) if (ex := a.find('=')) < 0 else a[0:ex] + '=' + strC(getattr(self, a[ex+1:])) for a in aa) + '}'
#---------- Rule: grammar rule ----------
class Rule (PObj):
    """ a Rule has a name (a symbol or nonterminal) and a body of terminals and nonterminals """
    rules = []    # the final rule list, built by finish()
    ruleAll = []  # every Rule ever created, in creation order
    s2r = {}      # symbol -> [Rule, ...]; after finish(): symbol -> single Rule or RuleOr
    def __init__(self, sy, bd):
        """ ini and add to ruleAll and s2r """
        super().__init__(sy)
        self.body = bd    # elements: terminal/nonterminal strings, resolved to Rules by finish()
        self.pos = None   # list of RulePos, created by makePos
        self.gr = None    # left subrules (recursively), see grGe0/grGen
        self.gq = False   # gq map, see gqGen; False: not computed, True: computation in progress
        self.suq = set()  # rules/positions directly following this rule, filled by gqGen
        Rule.ruleAll.append(self)
        if sy != '':
            if sy in Rule.s2r:
                Rule.s2r[sy].append(self)
            else:
                Rule.s2r[sy] = [self]
    def strA(self, *aa):
        # label the first attribute with the subclass kind ('Seq'/'Or'); the ternary
        # spans the whole concatenation, so the prefix is only added when aa[0] has no '='
        return super().strA(type(self).__name__[4:] + '=' + aa[0] if aa[0].find('=') < 0 else aa[0], *aa[1:])
    def strL(self):
        # long representation shows body and left-subrule map
        return self.strA('body', 'gr')
    @staticmethod
    def clear():
        """ reset all global grammar and state-machine data """
        PObj.lrK = 0
        Rule.rules.clear()
        Rule.ruleAll.clear()
        Rule.s2r.clear()
        State.clear()
    @staticmethod
    def add(*src):
        """add new rules: each rule is a string
        elements are words (double or sinble quoted strings, spechial char
        if an unquoted element is found as a rule name, it designates a rule, otherwise a terminal
        """
        for s in src:
            # NOTE(review): non-raw literal relies on '\w'/'\S' surviving unescaped
            # (DeprecationWarning today, SyntaxError in a future Python) -- prefer r'...'
            spl = re.findall('\w+|\'(?:[^\']|\'\')*\'?|"(?:[^"]|"")*"?|\S', s) # words | single quoted | doubble quoted | single nonspace char
            # (?: non capturing group, otherwise findall will use group!
            if len(spl) > 0 and spl[-1][0] in "\"'" and (len(spl[-1]) < 2 or spl[-1][0] != spl[-1][-1]):
                err('missing endquote string:', spl[-1], 'in source', s, src)
            assert(spl[1] == '=')
            RuleSeq(spl[0], spl[2:])
    def makePos(self, l):
        """ make positions for this rule """
        assert(self.pos == None)
        self.pos = [RulePos(self, ix) for ix in range(l)]
        # the position after the last element produces this rule with empty rest
        dsAdd(self.pos[-1].prd, PObj.sqEmpty, self)
    def gqGen(self):
        """ compute self.gq and register this rule in the suq set of its sub-rules """
        assert self.gq == False
        g = {}
        self.gqGe2(g, [])
        assert self.gq == False
        self.gq = g
        if isinstance(self, RuleOr):
            for b in self.body:
                if isinstance(b, Rule):
                    b.suq.add(self)
        else:
            l = len(self.body)
            for bx in range(l):
                b = self.body[bx]
                if isinstance(b, Rule):
                    # the last element continues with the rule itself, others with the next position
                    b.suq.add(self if bx == l-1 else self.pos[bx+1])
    def gqGe2(self, gg, h):
        """ merge this rule's gq entries (prefixed by path h) into gg;
        self.gq == True marks a rule currently being expanded (recursion guard) """
        if self.gq == True:
            return
        elif self.gq != False:
            # gq already computed: reuse it, prefixing the current path
            hT = tuple(h)
            for a, tt in self.gq.items():
                for t in tt:
                    dsAdd(gg, a, hT+t)
        elif len(self.pos) <= 1:
            # empty body: record under the special key ''
            dsAdd(gg, '', tuple(h+[self]))
        else:
            self.gq = True
            h.append(self.pos[1])
            self.gqGe3(gg, h)
            h.pop()
            assert self.gq == True
            self.gq = False
    def gqExt(self):
        """ extend gq for rules that can derive the empty sequence """
        if '' not in self.gq:
            return
        for p in self.gq['']:
            # NOTE(review): gqGe2 stores tuples in gq[''], and tuples have no .pos
            # attribute -- confirm this path is actually exercised
            if self.gqEx2(['bot', p if isinstance(p, RulePos) else p.pos[-1]], self.gq):
                dbg(1, 'gqEx2 True')
    def gqEx2(self, h, to):
        """ walk from position h[-1] over the following elements, adding entries to to """
        p = h[-1]
        assert isinstance(p, RulePos)
        if p.pos + 1 == len(p.rule.pos):
            return True
        # NOTE(review): one-argument isinstance raises TypeError if this line is reached
        assert isinstance(RuleSeq)
        b = p.rule.body[p.pos]
        n = p.rule.pos[p.pos+1]
        h[-1] = n
        if isinstance(b, str):
            # NOTE(review): this branch leaves res unbound for the return below
            dsAdd(to, b, tuple(h))
        elif isinstance(b, RuleSeq):
            h.append(b.pos[0])
            # NOTE(review): passes a tuple, but gqEx2 assigns to h[-1] -- confirm
            res = self.gqEx2(tuple(h), to)
            h.pop()
        else:
            res = False
            h.append(0)
            for b in b.body:
                h[-1] = 0
                res |= self.gqEx2(tuple(h), to)
            h.pop()
        h[-1] = p
        return res
    def grGen(self):
        """ generate gr (left subrules, recursively) for the gr already computed
        return True/False if an update needs a further round of grGen
        """
        u = False
        for r in list(self.gr.keys()):
            u |= dsUpdate(self.gr, r.gr)
        return u
    @staticmethod
    def prdGen():
        """ iterate the per-rule prdGen steps until a fixpoint is reached """
        u = True
        c = 0
        while u:
            c += 1
            if c > 40:
                # safety net against non-terminating fixpoint iteration
                err('too many')
            u = False
            for r in reversed(Rule.rules):
                u |= r.prdGen()
            dbg(15, 'prdGen round')
        dbg(3, 'prdGen', c, 'rounds')
    @staticmethod
    def lahGen():
        """ initialize lah from prB and iterate the per-rule lahGen steps to a fixpoint """
        for r in Rule.rules:
            for p in r.pos:
                p.lah.clear()
                dsUpdate(p.lah, p.prB)
        # the start rule's final position is followed by the empty sequence
        dsAdd(Rule.rules[0].pos[-1].lah, PObj.sqEmpty, Rule.rules[0])
        u = True
        lC = 0
        uC = 1
        while uC > 0:
            lC += 1
            uC = 0
            for r in Rule.rules:
                uC += r.lahGen()
            dbg(5, f'lahGen loop {lC}, upd {uC}')
    @staticmethod
    def genK(k):
        """ generate lookAheads for lr=K """
        if k <= PObj.lrK:
            # NOTE(review): message typo 'bur' and missing closing paren in the f-string
            err(f"genK({k} bur lrK already {PObj.lrK}")
        PObj.lrK = k
        Rule.prdGen()
        Rule.lahGen()
        if k == 1:
            # gq/suq are only built for the initial k == 1 pass
            for r in Rule.rules:
                r.gqGen()
            for r in Rule.rules:
                dbg(10, 'gqGen', r.strA('body', 'gq', 'suq'))
                oo = {}
                dsUpdate(oo, r.gq)
                r.gqExt()
                if oo != r.gq:
                    dbg(10, 'gqExt', r.strA('body', 'gq', 'suq'))
        if dbgLev >= 10:
            for r in Rule.rules:
                dbg(10, 'gen', r.strA('body'), '\n\t' + '\n\t'.join(p.strA('prd', 'prB', 'lah') for p in r.pos))
    @staticmethod
    def finish():
        """ after all rules are added, convert them to the final form and generate lrK=1 lookaheads """
        #--- add RuleOr for nonterminals withe several alternatives and put them to Rule.rules
        assert len(Rule.rules) == 0
        for r in Rule.ruleAll.copy():
            l = Rule.s2r[r.name]
            if type(l) != list:
                # a RuleOr for this symbol was already created by an earlier alternative
                assert type(l) == RuleOr and r in l.body
            else:
                assert type(l) == list and r in l, r.strA('body') + ', l=' + strC(l)
                if len(l) == 1:
                    Rule.s2r[r.name] = r
                else:
                    # several alternatives: wrap them in a RuleOr under the plain symbol
                    Rule.s2r[r.name] = RuleOr(r.name + '@' + str(len(Rule.rules)), l)
                    Rule.rules.append(Rule.s2r[r.name])
            # every concrete rule gets a unique '@index' suffix and enters rules
            r.setName(r.name + '@' + str(len(Rule.rules)))
            Rule.rules.append(r)
        #--- in all rules resolve symbols to its rule
        for r in Rule.rules:
            for i in range(len(r.body)):
                b = r.body[i]
                if type(b) == str:
                    if b[0] == "'" and b[-1] == "'" or b[0] == '"' and b[-1] == '"':
                        # quoted element: a terminal, strip the quotes
                        b = b[1: -1]
                    elif b in Rule.s2r:
                        # unquoted element naming a rule: resolve to the Rule object
                        b = Rule.s2r[b]
                    r.body[i] = b
                    assert b != PObj.end or r == Rule.rules[0]
            r.makePos()
            dbg(5, 'makePos body/pos len', len(r.body) , len(r.pos), r.strA('body','pos'))
        #--- generate gr, the left subrules recursively
        for r in Rule.rules:
            r.grGe0()
        u = True
        while u:
            u = False
            for r in reversed(Rule.rules):
                u |= r.grGen()
            dbg(10, 'grGen round')
        for r in Rule.rules:
            dbg(10, 'grGen', r.strA('body', 'gr'))
        Rule.genK(1)
#---------- RuleSeq: a Rule that is a sequence of terminals or rules ----------
class RuleSeq(Rule):
    """ a Rule whose body is a sequence of terminals and rules """
    def __init__(self, sym, bd):
        super().__init__(sym, bd)
    def makePos(self):
        # one position before each element plus one after the last
        super().makePos(len(self.body)+1)
    def gqGe3(self, gg, h):
        """ add the gq entries for the first element of the sequence """
        if len(self.body) == 0:
            # empty sequence: record under the special key ''
            dsAdd(gg, '', tuple(h +[self]))
        else:
            b = self.body[0]
            dsAdd(gg, b, tuple(h))
            if isinstance(b, Rule):
                b.gqGe2(gg, h)
    def grGe0(self):
        """ compute inital value for gr"""
        # only the first element can start a left derivation
        self.gr = {self.body[0]: {self.pos[1]}} if len(self.body) > 0 and isinstance(self.body[0], Rule) else {}
    def prdGen(self):
        """ compute prd and prB from the values already computed (one step)
        walks the body right to left, combining each element's first-token
        information with the prd/prB of the positions to its right;
        prd holds prefixes shorter than lrK that reach the rule end,
        prB holds lrK-long lookahead fragments that need a continuation
        """
        pp = self.pos[-1].prd
        ll = self.pos[-1].prB
        p0 = self.pos[0].prd
        l0 = self.pos[0].prB
        assert len(ll) == 0
        assert len(pp) == 1 and Rule.sqEmpty in pp and pp[Rule.sqEmpty] == {self}
        for bx in range(len(self.body)-1, -1, -1):
            b = self.body[bx]
            o = self.pos[bx]
            oN = self.pos[bx+1]
            pN = {}
            lN = {}
            if isinstance(b, str):
                # terminal: prepend it to every sequence from the right
                a = b if PObj.sqStr else (b, )
                for p in pp.keys():
                    if len(p) < PObj.lrK:
                        dsAdd(pN, a+p, oN)
                    else:
                        dsAdd(lN, (a+p)[0:PObj.lrK], oN)
                for l in ll.keys():
                    dsAdd(lN, (a+l)[0:PObj.lrK], oN)
            else:
                # rule: combine its first-position prd/prB with the tails from the right
                for a, t in b.pos[0].prd.items():
                    if len(a) == PObj.lrK:
                        if PObj.sqEmpty in pp:
                            dsUnion(pN, a, t)
                        if any(len(p) > 0 for p in pp) or any(len(l) > 0 for l in ll):
                            dsUnion(lN, a, t)
                    else:
                        for p in pp.keys():
                            if len(a) + len(p) <= PObj.lrK:
                                dsUnion(pN, a+p, t)
                            else:
                                dsUnion(lN, (a+p)[0: PObj.lrK], t)
                        for l in ll.keys():
                            # NOTE(review): '>=' here vs '<=' in the prd case above -- confirm intended
                            if len(a) + len(l) >= PObj.lrK:
                                dsUnion(lN, (a+l)[0: PObj.lrK], t)
                for a, t in b.pos[0].prB.items():
                    if len(a) == PObj.lrK:
                        dsUnion(lN, a, t)
            o.prd = pN
            o.prB = lN
            pp = pN
            ll = lN
        # changed if the first position's maps differ from their previous values
        return p0 != self.pos[0].prd or l0 != self.pos[0].prB
    def lahGen(self):
        """ compute lah from the values already computed of pos[-1].lah and prd and prB (one step) """
        u = False
        pp = self.pos[-1].lah
        nn = pp
        #dbg(1, 'lahGen0', self.strA('body'), pp)
        for qx in range(len(self.pos)-2, -1, -1):
            q = self.pos[qx]
            b = self.body[qx]
            if isinstance(b, Rule):
                # a sub-rule's end inherits the lookahead following it here
                u |= b.pos[-1].lahExt(nn)
            # NOTE(review): extends q with pp (the rule-end lah), not nn -- confirm intended
            q.lahExt(pp)
            nn = q.lah
            #dbg(1, 'lahGen2', q.strA('lah', 'prd', 'prB'))
        return u
#---------- RuleOr: a Rule conisting of oa choice of terminals or rules ----------
class RuleOr(Rule):
    """ a Rule consisting of a choice of terminals or rules (one alternative per body element) """
    def __init__(self, sym, bd):
        super().__init__(sym, bd)
    def makePos(self):
        # a choice has only a start and an end position
        super().makePos(2)
    def gqGe3(self, gg, h):
        """ add gq entries for every alternative """
        h.append(self.pos[1])
        for b in self.body:
            dsAdd(gg, b, tuple(h))
            if isinstance(b, Rule):
                b.gqGe2(gg, h)
        h.pop()
    def grGe0(self):
        """ compute inital value for gr"""
        # every alternative can start a left derivation
        self.gr = {b: {self.pos[1]} for b in self.body}
    def prdGen(self):
        """ compute prd and prB from the values already computed (one step)
        the start position's maps are the union over all alternatives """
        assert len(self.pos[-1].prB) == 0
        assert len(self.pos[-1].prd) == 1 and Rule.sqEmpty in self.pos[-1].prd and self.pos[-1].prd[Rule.sqEmpty] == {self}
        pp = {}
        ll = {}
        for b in self.body:
            if isinstance(b, str):
                dsAdd(pp, b if PObj.sqStr else(b,), self.pos[1])
            else:
                for a,t in b.pos[0].prd.items():
                    dsUnion(pp, a, t)
                for a,t in b.pos[0].prB.items():
                    dsUnion(ll, a, t)
        if self.pos[0].prd == pp and self.pos[0].prB == ll:
            return False
        self.pos[0].prd = pp
        self.pos[0].prB = ll
        return True
    def lahGen(self):
        """ compute lah from the values already computed of pos[-1].lah and prd and prB (one step) """
        u = False
        pp = self.pos[-1].lah
        nn = {}  # NOTE(review): unused local
        for b in self.body:
            if isinstance(b, Rule):
                # every alternative's end inherits the lookahead of the choice's end
                u |= b.pos[-1].lahExt(pp)
        # dbg(1, 'lahGen1', b.pos[-1].strA('rule', 'pos', 'lah'), 'nn=', nn)
        # NOTE(review): return value ignored here, unlike the b.pos[-1] calls above -- confirm
        self.pos[0].lahExt(pp)
        return u
#---------- RulePos: position in a Rule ----------
class RulePos (PObj):
    """ a position inside a Rule: before element pos (or at the end when pos == len(body)) """
    def __init__(self, r, p):
        super().__init__(r.name + '#' + str(p))
        self.rule = r  # the owning Rule
        self.pos = p   # index of this position within rule.pos
        self.prB = {}  # lookahead fragments of exactly lrK tokens needing a continuation
        self.prd = {}  # token sequences (shorter than lrK) produced up to the rule end
        self.lah = {}  # the resulting lookahead map: sequence -> set of positions/rules
    def strL(self):
        return self.strA('rule', 'pos')
    def lahExt(self, ee):
        """ extend self.lah by ee, return True if lah has been updated"""
        u = False
        for a, p in self.prd.items():
            if len(a) == PObj.lrK:
                # already full length: no continuation needed
                u |= dsUnion(self.lah, a, p)
            else:
                # pad each short prefix with every continuation from ee, truncated to lrK
                for e in ee.keys():
                    u |= dsUnion(self.lah, (a+e)[0: PObj.lrK], p)
        return u
#---------- State: a set of positions ----------
class State (PObj):
    """
    state is set of positions, not (set of positions, lookAhead) as for knuth
    thus, we reduce the number of states
    """
    p2s = {}       # frozenset of RulePos -> State (state interning)
    state0 = None  # the start state, set by makeAll
    lrErr = None
    def __init__(self, pa):
        super().__init__('s' + str(len(State.p2s)))
        assert isinstance(pa, frozenset)
        assert all(isinstance(p, RulePos) for p in pa)
        assert pa not in State.p2s
        self.pa = pa   # the set of positions of this state
        self.go = None # transition map, built by goExp/goFix
        State.p2s[pa] = self
    @staticmethod
    def getOrMake(pa):
        """ intern: return the existing State for pa or create a new one """
        pa = frozenset(pa)
        return State.p2s[pa] if pa in State.p2s else State(pa)
    @staticmethod
    def clear():
        State.states = []
        State.p2s.clear()
    def strL(self):
        return self.strA('pa', 'go')
    @staticmethod
    def makeAll():
        """ build all states from the start rule, then raise lrK until conflict free (max 4) """
        State.state0 = State(frozenset([Rule.rules[0].pos[0]]))
        State.state0.goExp()
        State.err = ''
        for s in State.p2s.values():
            dbg(5, 'goExp', s.strA('pa', 'go'))
        for lrK in range(1,5):
            dbg(1, 'gen lrK', lrK)
            if lrK > 1:
                Rule.genK(lrK)
            cnf = False
            for s in State.p2s.values():
                cnf |= s.goFix()
            if not cnf:
                break
        else:
            # loop exhausted without break: grammar not LR(4)
            State.err = f'still conflicts at lrK {lrK}'
        if dbgLev >= 5:
            # NOTE(review): missing f-prefix, '{lrK}' is printed literally
            dbg(5, 'makeAll lr={lrK}', State.err, '\n\t' + '\n\t'.join(s.strA('pa', 'go') for s in State.p2s.values()))
    def goExp(self):
        """ compute lr1 go, and recursively create necessary states """
        if self.go != None:
            return
        self.go = {}
        for p in sorted(self.pa, key=lambda p: p.name): # sort for debugging: to ensure sequence of states
            dsUpdate(self.go, p.lah if Rule.sqStr else {'' if len(a) == 0 else a[0]: t for a,t in p.lah.items()})
            r = p.rule
            if p.pos < 1:
                # at the start of a rule: include its left subrules
                dsUpdate(self.go, r.gr)
            elif isinstance(r, RuleSeq) and p.pos < len(r.body):
                r = r.body[p.pos]
                if isinstance(r, Rule):
                    # shifting the sub-rule advances to the next position
                    dsAdd(self.go, r, p.rule.pos[p.pos+1])
                    dsUpdate(self.go, r.gr)
        for a, tt in self.go.items():
            # collapse the RulePos members of each target set into an interned State
            t1 = frozenset(t for t in tt if isinstance(t, RulePos))
            if len(t1) > 0:
                s = State.getOrMake(t1)
                if isinstance(a, Rule):
                    assert tt == t1
                    self.go[a] = s
                else:
                    self.go[a] = (tt - t1) | {s}
                s.goExp()
    def goFix(self):
        """ use lah, to extend go to a tree
        if conflicts (i.e. grammar nor lr(lrK) return True
        else adapt go accordingly
        """
        #--- compute lah
        if len(self.pa) == 1:
            ll = anyOf(self.pa).lah
        else:
            ll = {}
            for p in self.pa:
                dsUpdate(ll, p.lah)
        assert all(PObj.end not in k for k in ll.keys())
        #--- build tree of lookAheads and detect conflicts
        e = {}
        cnf = False
        for a, t in sorted(ll.items()): # sort for debugging: to ensure sequence of conflicts
            pp = {u for u in t if isinstance(u, RulePos)}
            if len(pp) < 1:
                tt = t
            else:
                assert len(a) > 0
                gg = self.go[a[0]]
                if not isinstance(gg, set):
                    if isinstance(gg, Rule):
                        # NOTE(review): a Rule is never a State, so this errS always
                        # fires when gg is a Rule -- confirm intended error case
                        if not isinstance(gg, State):
                            errS('f3', self, a, t, self.go[a[0]])
                    ddPut(e, *a, gg)
                    continue
                s = {u for u in gg if isinstance(u, State)}
                assert len(s) == 1
                s = anyOf(s)
                dbg(13, 'goFix confliXyy', a, pp, s)
                assert all(p in s.pa for p in pp)
                # replace the positions by the single shift state
                tt = (t - pp) | {s}
            if len(tt) == 1:
                ddPut(e, *(a if len(a) > 0 else ['']), anyOf(tt))
            else:
                # several different targets for the same lookahead: conflict
                dbg(3, 'goFix conflict in ', self.strA('pa'), ', lah=', strNSsq(a), ', to=', strSo(v.strA('pa') if isinstance(v, State) else str(v) for v in tt), sep='')
                cnf = True
        if cnf:
            return True
        #--- find default and reduce tree
        if '' in e:
            dflt = e['']
        else:
            # a unique finished rule becomes the default reduce action
            rr = {p.rule for p in self.pa if p.pos == len(p.rule.pos)-1}
            dflt = anyOf(rr) if len(rr) == 1 else None #anyOf(None
        assert dflt == None or isinstance(dflt, Rule)
        ddRed(e, dflt)
        dbgS(10, 'goFixa3 red dflt', dflt, e)
        # merge tree into go
        for a,t in e.items():
            assert a not in self.go or isinstance(t, dict) or t == self.go[a] or {t} == self.go[a], strS('a=',a, 't=', t, self.strA('pa', 'go'))
            self.go[a] = t
        if dflt != None:
            dfltS = {dflt}
            # drop entries equal to the default, keep it once under ''
            for k in [k for k,v in self.go.items() if v == dflt or v == dfltS]:
                del self.go[k]
            self.go[''] = dflt
        dbg(5, 'goFixa9 go', self.strA('pa', 'go'))
        return False
    @staticmethod
    def checkLR0():
        """ check if the current LR(1) grammar is even LR(0)
        return None if it is, otherwsie a string describing the conflicting state
        LR(0) grammars are implemented using LR(1) grammars
        LR(0) means, that a reduce step does not depend on lookahead.
        a shift step however uses the char shifted (== lookahead) to determine the next state
        """
        assert PObj.lrK == 1
        # NOTE(review): looks stale -- State.states is only assigned (empty) in clear(),
        # states have .go not .to, and RuleSym is not defined anywhere in this file
        for u in State.states:
            assert len(u.to) > 0
            fi = None
            for k,v in u.to.items():
                if not isinstance(k, RuleSym):
                    if fi == None:
                        fi = v
                    elif v != fi if isinstance(v, Rule) else not (isinstance(v, State) and isinstance(fi, State)):
                        return strL("grammar not LR(0) because of", u)
#---------- LahIter: input iterator ----------
class LahIter():
    """ input iterator with __getitem__ for lookahead

    after __next__ returned the token at index ix, subscript [i] peeks at the
    token i places after it: [0] is the next token, [-1] the one just returned;
    before the start it yields '<before>', past the end PObj.end
    """
    def __init__(self, l):
        self.list = l  # the token sequence (read by Parser via .list/.ix)
        self.ix = -1   # index of the token most recently returned by __next__
    def __iter__(self):
        return self
    def __next__(self):
        self.ix += 1
        try:
            return self.list[self.ix]
        except IndexError:
            raise StopIteration() from None
    def __getitem__(self, i):
        pos = self.ix + 1 + i
        if pos < 0:
            return '<before>'
        return self.list[pos] if pos < len(self.list) else PObj.end
#---------- Parser: the Parser ----------
class Parser:
    """ the parsing state machine plus test drivers """
    pCnt = -1  # step counter of the last parse (for statistics)
    @staticmethod
    def eStr(e):
        """ short one-level display of a parse tree node: rule>childCount(children) """
        if type(e) != tuple and type(e) != list:
            return str(e)
        else:
            return str(e[0]) + '>' + str(len(e)-1) + '(' + ', '.join(x if type(x) != tuple and type(x) != list else str(x[0]) + '>' + str(len(x)-1) for x in e[1:]) + ')'
    @staticmethod
    def parse(inp):
        """ the parsing state machine
        inp: list of tokens; returns the parse tree (list: [rule, child...])
        on success, or an error message string on a syntax error
        """
        assert len(State.state0.pa) == 1
        assert len(q := State.state0.pa) == 1 and (q := anyOf(q)).pos == 0 and q.rule == Rule.rules[0]
        r0 = Rule.rules[0]
        dbgS(15, 'parsing for', r0, 'input', strNSsq(inp))
        st = [(None, State.state0)]  # stack of (parseTreeNode or token, State)
        inI = LahIter(inp)
        Parser.pCnt = -1
        cntL = 1000  # safety limit against endless shift/reduce loops
        act = 'start'
        shiL = st[-1][1]  # state of the last shift, used for the expected-token message
        res = None
        try: # end of input or syntax may throw Keyerror exceptions
            while True:
                Parser.pCnt += 1
                e,s = st[-1]
                lah = inI[0]
                dbgS(10, 'parse', (str(Parser.pCnt) + ' ' + act).ljust(20), 'lah', lah, 'stck', len(st), '>', e, s)
                if Parser.pCnt > cntL:
                    err(f"too many loops cnt={Parser.pCnt}, lah={lah} st=", st)
                to = s.go
                dbg(15, 'parse to0', to)
                for fx in range(PObj.lrK): # dive into go tree along lookahead
                    to = to[inI[fx]] if inI[fx] in to else to[''] # Keyerror means syntax or end
                    dbg(20, 'parse to1', to)
                    if not isinstance(to, dict): break
                else:
                    errS('parser to deeply nested lah=', lah, 'to=', to, 's=', s.strA('pa', 'go'))
                if isinstance(to, State): # shift first lookahead
                    st.append((next(inI), to))
                    assert st[-1][0] == lah
                    shiL = to
                    act = 'shift ' + lah
                    lah = inI[0]
                elif isinstance(to, Rule): # reduce with rule to
                    # pop the rule's elements off the stack and push the tree node
                    sx = len(st) - len(to.pos) + 1
                    p = [to] + [x0 for x0, x1 in st[sx : ]]
                    act = 'reduce ' + str(to)
                    del st[sx:]
                    st.append((p, st[-1][1].go[to]))
                else:
                    err('parse else to', to, 'st=', st)
        except KeyError:
            # no entry in go for the lookahead: regular end of input or a syntax error
            if isinstance(to, State):
                dbgS(5, 'parse keyError State to=', to, act)
            elif isinstance(to, Rule):
                dbgS(5, 'parse keyError Rule to=', to, sx, p, act)
                res = p if len(st) == 1 else st[-1][0] # maybe we reduced st[1] !!!
            elif isinstance(to, dict):
                dbgS(5, 'parse keyError dict', to)
                res = st[-1][0]
            else:
                err('parse keyError else', to)
        except StopIteration:
            err('parse internal')
        dbg(20, 'parseEnd lah', lah, 'act', act, 'to', to
            , 'stack =\n\t' + '\n\t'.join(strS(str(sx) + ': ' + Parser.eStr(st[sx][0]), 'state =', st[sx][1]) for sx in range(len(st)-1, -1, -1)))
        # NOTE(review): res may still be None here; res[0] would then raise -- confirm unreachable
        if lah == PObj.end and len(st) <= 2 and res[0] == Rule.rules[0]:
            dbgS(5, 'parsed', str(res[0]), 'from', len(inp), 'tokens, in', Parser.pCnt, 'steps,', len(State.p2s), 'states')
            return res
        else:
            # syntax error: build a message listing the expected lookaheads
            tNr = inI.ix
            iA = inp[tNr+1: ] + [PObj.end]
            iB = inp[0:tNr+1]
            ex = set() # compute expected:
            for p in shiL.pa:
                for q in p.lah.keys():
                    if len(q) < PObj.lrK:
                        q += PObj.sqEnd
                    for i in range(min(len(iA), len(q))): # reduce amount of output if lrK>1, by showing only prefix of expected up to first difference to lah
                        if iA[i] != q[i]:
                            break
                    ex.add(q[0: i+1])
            ex = ', '.join(strNSsq(q) for q in sorted(ex))
            r = f"syntax {'at begin' if tNr < 0 else 'after ' + inI[-1]} tokenNr {tNr} expected: {ex}, not lah: {strNSsq(iA[-5:])}"
            dbg(1, r)
            dbg(5, 'last tokens', iB[-5:], 'tokennr', tNr, ', lah', iA[0:5]
                , '\n\tpreceeding', iB[0: tNr+1]
                , '\n\tfollowing ', iA
                , '\n\tstack', len(st), 'res', res
                , '\n\t\t' + '\n\t\t'.join(strS(str(sx) + ': ' + Parser.eStr(st[sx][0]), 'state =', st[sx][1]) for sx in range(len(st)-1, -1, -1))
                )
            return r
    @staticmethod
    def makeGr(*grammar):
        "make Rule, Sym and Pos and State for given grammer"
        out("grammar source", grammar)
        Rule.clear()
        Rule.add(*grammar)
        Rule.finish()
        State.makeAll()
    @staticmethod
    def testGr(nm, syn, *cs):
        """test named nm, with grammar syn and testcases cs"""
        print(f"{nm} --- begin test " + strSep1)
        Parser.makeGr(*syn)
        if State.err != '':
            print('error generating grammar', nm, State.err)
        else:
            Parser.test(nm, *cs)
    @staticmethod
    def test(nm, *tsts):
        """ test for the current grammar, named nm the tests tsts """
        # if tsts == None:
        #     tsts = ['2', '1 + 2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (']
        ll = 5  # width of the token column in the tree listing
        def treeSplit(nd, b):
            """split the tree nd into linkes,
            each line contains the path to one (shifted) token or to one empty rule
            intendation shows the children of a rule
            the left ll characters, contain the shifted token or spaces for an empty rule
            b is the current line, filled with the path of entered rules with the correct intendation
            """
            if isinstance(nd, str):
                return nd[0:ll-1].ljust(ll) + b[ll:] + '==' + nd
            c = b + str(nd[0]) + ' '
            if len(nd) <= 1:
                return c + '==emptyRule'
            r = treeSplit(nd[1], c)
            c = len(c) * ' '
            for ch in nd[2:]:
                r += "\n" + treeSplit(ch, c)
            return r
        for i in range(len(tsts)) :
            sT = re.findall(r'\w+|\S', tsts[i])
            print(f"test begin {nm} {i} input: {strNSsq(sT)} {strSep1}")
            assert all(len(t) == 1 if PObj.sqStr else len(t) > 0 for t in sT)
            ast = Parser.parse(sT)
            if isinstance(ast, str):
                print(f'syntax test {nm} {i} : {ast}\n\tinput {strNSsq(sT)}')
            else:
                print(f"parsed {str(ast[0])} from {len(sT)} tokens, {Parser.pCnt} steps, {len(State.p2s)} states, Rules {len(Rule.rules)}")
                print(treeSplit(ast, ll * ' '))
            dbg(5, 'test', 'syntaxed' if isinstance(ast, str) else 'parsed', f"{nm} {i} input {strNSsq(sT)} {strSep1}")
    @staticmethod
    def test1():
        """my simple tests """
        g = ["s=s+p", "s = p", "p = p '*' i", "p = i", "i = '(' s ')'", "i = '0'", "i = '1'", "i = '2'"]
        Parser.testGr('arithExpr', g
            ,'2', '1 + 2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +')
        # NOTE(review): g2 and the reassigned g below are never used afterwards
        g2 = g[1:-3] + ["s = s pm p", "p = p '/' i", "i = epm d", "i = epm i d", "epm = pm", "epm ="
            , "pm = '+'", "pm = '-'", "d = '0'", "d = '1'", "d = '2'"]
        g = g[1:-3] + ["s = s pm p", "p = p '/' i", "i = epm d", "i = i d", "epm = pm epm", "epm ="
            , "pm = '+'", "pm = '-'", "d = '0'", "d = '1'", "d = '2'"]
        Parser.testGr('arithExprPM' , ["s=s pm p", "s=p", "p=p*i", "p=p/i", "p=i", "i=( s )", "i = j"
            , "j=epm d", "j=j d", "epm = pm epm", "epm = ", "pm=+", "pm=-", "d=0", "d=1", "d = 2 "]
            ,'2', '1 + 2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +'
            , '1 0 / - - 1 1 + + - 1 2 / ( 1 - - 2 / + 0 )', '- - 2 * * ++1 ** -2 * 1 + 0')
        Parser.testGr('arithExprPE' , ["s=s pm p", "s=p", "p=p*e", "p=p/e", "p=e", "e=i", "e=i**e", "i=( s )", " i=j"
            , "j=epm d", "j=j d", "epm = pm epm", "epm = ", "pm=+", "pm=-", "d=0", "d=1", "d = 2 "]
            ,'2', '1 + 2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +'
            , '1 0 / - - 1 1 + + - 1 2 / ( 1 - - 2 / + 0 )', '- - 2 * * ++1 ** -2 * 1 + 0')
        Parser.testGr('arithExprPE2' , ["s=s pm p", "s=p", "p=p*e", "p=p/e", "p=e"
            , "e=i", "e=i**e", "e=pm e", "pm=+", "pm=-", "i=( s )", "i = dd", "dd=d", "dd=d dd", "d=0", "d=1", "d = 2 "]
            ,'2', '1 + 2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +'
            , '1 0 / - - 1 1 + + - 1 2 / ( 1 - - 2 / + 0 )', '- - 2 * * ++1 ** -2 * 1 + 0')
        Parser.testGr('LR(2)?' , ["S = AA 'a' 'a'", "S = AB 'a' 'b'", "AA =", "AB ="]
            ,"a a", "a b", "a c", "b a")
    @staticmethod
    def testKnuth():
        """some of the examples given by Donald E. Knuth. 1965. On the Translation of Languages from Left to Right """
        Parser.testGr('knuth(1)'
            , ["S = A D", "A = 'a' C", "B = 'b' 'c' 'd'", "C = B E", "D =", "E = 'e'"]
            , "a b c d e", "a b x d e", "a b", "a b c d e f")
        Parser.testGr('knuth(6)'
            , ["S = 'a' A 'c'", "A = 'b' A 'b'", "A = 'b'"]
            , "a b b b c", "a b c", "a b b c", "a b b b b b b b c e")
        Parser.testGr('knuth(7) lr(0)'
            , ["S = 'a' A 'c'", "A = A 'b' 'b'", "A = 'b'"]
            , "a b b b c", "a b c", "a b b c", "a b b b b b b b c e")
        Parser.testGr('knuth(16x) lr(3)'
            , ["S=B C", "B=C e", "B = ", "C=D", "C=D c", "D=", "D=d"]
            , "", "c", "c e", "c e d")
        Parser.testGr('knuth(24)'
            , ["S =", "S = 'a' A 'b' S", "S = 'b' B 'a' S", "A =", "A = 'a' A 'b' A", "B =", "B = 'b' B 'a' B"]
            , "", "a b", "b a", "a b b", "a a b b", "a b a b", "b a a b")
#---------- Test: div internal tests ----------
class Test:
    """ div internal tests/demos; each method prints its results
    NOTE: methods have no self/@staticmethod -- they only work called as Test.xx() """
    def re():
        # demonstrate tokenizing with lookahead/lookbehind vs plain findall
        x = ' ab cd e?f *+??'
        print(f"x={x}")
        print("re: lookahead assertion and lookbehind assertions to find boundaries between words and or special character")
        for y in re.finditer(r'\s+|(?<=\S)(?=\S)((?<=\W)|(?=\W))', x):
            print([y.start(),y.end(), x[y.start(): y.end()+1]])
        print("easier and more stable against leading/trailing spaces: use findall for words or special chars")
        for y in re.finditer(r'\w+|\S', x):
            print([y.start(),y.end(), x[y.start(): y.end()]])
        print(re.findall(r'\w+|\S', x))
        x += "' '' b 'oder1''1.2und. 3 ' '.a.234 4.. single' !"
        print('mit strings and numbers:', x)
        print(re.findall(r"""'(?:''|[^'])*' # single quoted strings, doubble apostrophs for a single
            # non capturing group (?:...) otherwise findall will return groups!
            |\d+(?:\.\d*) # number with leading digits and possible decimal point/digits
            |\.\d+ # number with leading decimal points and digits
            |\w+ # words
            |\S""" # special chars
            , x
            , re.X)) # re.X = re.VERBOSE flag
    def dd():
        # exercise ddPut/ddGet on a small tree
        d = {}
        ddPut(d, 0, 'null')
        ddPut(d, 'a', 'b', 'ab')
        dbg(1, 'a, b', d)
        ddPut(d, 'a', 'c', 'ac')
        dbg(1, 'a, c', d)
        ddPut(d, 'a', 'c', 'd', 'acd')
        dbg(1, 'a, c, d', d)
        ddPut(d, 'a', 'c', 'ac++')
        dbg(1, 'a, c', d)
        dbg(1, 'ddGet a', ddGet(d, 'a'))
        dbg(1, 'ddGet y', ddGet(d, 'y'))
        dbg(1, 'ddGet a b', ddGet(d, 'a', 'b'))
        dbg(1, 'ddGet a y', ddGet(d, 'a', 'y'))
        dbg(1, 'ddGet a c d', ddGet(d, 'a', 'c', 'd'))
        dbg(1, 'ddGet a c d y', ddGet(d, 'a', 'c', 'd', 'y'))
    def LahIter():
        # inside the method, the global class LahIter is found despite the method name
        iter = LahIter(['begin', 0, 1,2,3,4])
        for i in iter:
            print(i, iter[0], iter[1], iter[-1])
    def range():
        # demonstrate slicing and nested comprehensions
        l = [0, 1, 2]
        s = '012'
        print([(i, l[0:i], s[0:i]) for i in range(6)])
        print([(x, y) for x in range(3) for y in range(2)])
        print([(x, y) for x in range(3) for y in range(3*x, 4*x+1)])
        print([(x, y) for x in range(3) if x > 0 for y in range(3*x, 4*x+1) if y > x])
    @staticmethod
    def all():
        """ run all the demos above """
        out('Test.all', 2 * strSep1)
        out('Test.re regular expressions', strSep1)
        Test.re()
        out('Test.dd dict dict tree', strSep1)
        Test.dd()
        out('Test.LahIter iterator with lookahead', strSep1)
        Test.LahIter()
        out('Test.range', strSep1)
        Test.range()
        out('Test.all end', 2 * strSep1)
# module entry: flip the 0 to run the internal demos instead of the parser tests
# NOTE(review): runs on import as well; consider an `if __name__ == '__main__':` guard
if 0:
    Test.all()
else:
    Parser.test1()
    Parser.testKnuth()
print('end', __file__)