python/parserGoLah.py
# knuth lr(k) parser
import re
def strL(*ee):
return ', '.join(e if type(e) == str else e.strL() if hasattr(type(e), 'strL') else strC(e) for e in ee)
def strS(*ee):
return ', '.join(e if type(e) == str else strC(e) for e in ee)
strCW = re.compile('\w+')
def strC(e):
"""return a str represntig e. recursively expand collections, otherwise use str"""
t = type(e)
if t == str:
return repr(e) if strCW.fullmatch(e) == None else e
elif t == tuple:
return '(' + ', '.join(strC(v) for v in e) + ')'
elif isinstance(e, list): # t == list:
return '[' + ', '.join(strC(v) for v in e) + ']'
elif t == set or t == frozenset:
return '{' + ', '. join(strC(v) for v in e) + '}'
elif isinstance(e, dict):
return '{' + ', '. join(strC(k) + ': ' + strC(v) for k, v in e.items()) + '}'
else:
return str(e)
def err(*e):
raise Exception('error: ' + strL(*e))
exit()
dbgLev = 3
dbgTime = 0
def dbg(l, *e):
if dbgLev >= l:
print('dbg: ' + strL(*e))
from datetime import * ;
import re;
def dsUpdate(le, ri):
u = False
for k,v in ri.items():
if k not in le:
le[k] = set()
o = len(le[k])
le[k] |= ri[k]
if o!= len(le[k]):
u = True
return u
def dsAdd(g, e, p):
if e in g:
s = g[e]
o = len(s)
s.add(p)
return o < len(s)
else:
g[e] = {p}
return True
def dsAddS(g, e, s):
if len(s) < 1:
return False
if e in g:
o = len(g[e])
g[e] |= s
return len(g[e]) > o
else:
g[e] = s.copy()
return True
class Leaf(dict):
pass
def d2sTg(g, *kk):
a = g
for kx in range(len(kk)-2):
k = kk[kx]
if not k in a:
a[k] = {}
a = a[k]
k = kk[-2]
if not k in a:
a[k] = Leaf()
a = a[k]
k = kk[-1]
if k not in a:
a[k] = set()
return a[k]
def d2sAdd(g, e, f, p):
s = d2sTg(g, e, f)
o = len(s)
s.add(p)
return o < len(s)
def d2sAddS(g, e, f, pp):
s = d2sTg(g, e, f)
o = len(s)
s |= pp
return o < len(s)
def dsDbg(lv, g, *a):
if dbgLev < lv:
return
if len(a) > 0:
print(*a)
for k,v in g.items():
print("\t" + str(k) + ' => ' + ', '.join({str(p) for p in v}))
class PObj:
"root and container for all our objects"
end = '!'
rStart = 'start'
lahStr = True # lah is true ==> a string, False => tuple of strings
lahEmpty = '' if lahStr else ()
objs = []
lrK = 0
def setName(self, nm):
self.name = (self.__class__.__name__ + '#' + str(len(self.objs))) if nm == '' else nm
def __init__(self, nm = ''):
self.setName(nm)
self.objs.append(self)
def __str__ (self):
return self.name
def strA(self, *aa):
return str(self) + '{' + ', '.join(a + '=' + strC(getattr(self, a)) if (ex := a.find('=')) < 0 else a[0:ex] + '=' + strC(getattr(self, a[ex+1:])) for a in aa) + '}'
@staticmethod
def dbg(l, m):
err('wer braucht das')
if dbgLev >= l:
print(m)
p = '['
for o in PObj.objs:
print(p, o)
p = ','
print(']')
class Rule (PObj):
"a Rule has a name (a symbol or nonterminal) and a body of terminals and nonterminals"
rules = []
s2r = {}
def __init__(self, sy, bd):
super().__init__(sy)
self.body = bd
self.pos = None
self.go = False
self.brd = False
self.redCh = {self}
self.sub = {}
self.subRi = {self}
self.prd = set()
self.lah = set()
self.st = set()
self.tst = False
Rule.rules.append(self)
if sy != '':
if sy in Rule.s2r:
Rule.s2r[sy].append(self)
else:
Rule.s2r[sy] = [self]
def strL(self):
return self.strA(type(self).__name__ + '=body', 'go')
def dbg(self, l, *a):
if dbgLev >= l:
print(self.strL(), *a)
@staticmethod
def clear():
Rule.rules.clear()
Rule.s2r.clear()
State.clear()
PObj.objs.clear()
@staticmethod
def add(*src):
"""add new rules: each rule is a string
elements are words (double or sinble quoted strings, spechial char
if an unquoted element is found as a rule name, it designates a rule, otherwise a terminal
"""
for s in src:
spl = re.findall('\w+|\'(?:[^\']|\'\')*\'?|"(?:[^"]|"")*"?|\S', s) # words | single quoted | doubble quoted | single nonspace char
# (?: non capturing group, otherwise findall will use group!
if len(spl) > 0 and spl[-1][0] in "\"'" and (len(spl[-1]) < 2 or spl[-1][0] != spl[-1][-1]):
err('missing endquote string:', spl[-1], 'in source', s, src)
assert(spl[1] == '=')
RuleSeq(spl[0], spl[2:])
@staticmethod
def finish():
rr = Rule.rules
Rule.rules = []
r0 = RuleSeq('', [None, PObj.end])
r0.setName('@@0')
for r in rr:
if type(Rule.s2r[r.name]) == list:
Rule.s2r[r.name] = r if len(Rule.s2r[r.name]) == 1 else RuleOr(r.name + '@' + str(len(Rule.rules)), Rule.s2r[r.name])
r.setName(r.name + '@' + str(len(Rule.rules)))
Rule.rules.append(r)
r0.body[0] = Rule.rules[1]
for r in Rule.rules:
for i in range(len(r.body)):
b = r.body[i]
if type(b) == str:
if b[0] == "'" and b[-1] == "'" or b[0] == '"' and b[-1] == '"':
b = b[1: -1]
elif b in Rule.s2r:
b = Rule.s2r[b]
r.body[i] = b
assert b != PObj.end or r == Rule.rules[0]
r.makePos()
r.dbg(1, '+pos len', len(r.body) , len(r.pos), r.strA('body', 'pos'))
for r in Rule.rules:
assert r.go == False
g = {}
r.goGen(g)
assert r.go == False
for e,p in g.items():
g[e] = frozenset(p)
r.go = g
dbg(1, 'goGen', r.strA('body', 'go'))
return
if False:
Rule.prdLah(1)
for r in Rule.rules:
bd = r.body
if isinstance(r, RuleOr):
r.redCh |= set(b for b in bd if isinstance(b, Rule))
else:
ne = [b for b in bd if isinstance(b, str) or '' not in b.prd]
if len(ne) == 0:
r.redCh |= set(b for b in bd if isinstance(b, Rule))
elif len(ne) == 1 and isinstance(ne[0], Rule):
r.redCh.add(ne[0])
u = True
while u:
u = False
for r in Rule.rules:
o = r.redCh.copy()
for s in o:
r.redCh |= s.redCh
if len(o) < len(r.redCh):
u = True
dbg(99, 'redCh', r.strA('redCh'))
for r in Rule.rules:
dbg(1, 'redCh', r.strA('body', 'redCh'))
for r in Rule.rules:
r.sub = {r for r in r.body if isinstance(r, Rule)}
u = True
while u:
u = False
for r in Rule.rules:
oo = r.sub.copy()
for o in oo:
r.sub |= o.sub
if len(oo) < len(r.sub):
u = True
dbg(99, 'sub', r.strA('sub'))
for r in Rule.rules:
dbg(1, 'sub', r.strA('body', 'sub'))
for r in Rule.rules:
bd = r.body
if isinstance(r, RuleOr):
r.subRi |= set(b for b in bd if isinstance(b, Rule))
else:
for b in reversed(bd):
if isinstance(b, str):
break
r.subRi.add(b)
if '' not in b.prd:
break
u = True
while u:
u = False
for r in Rule.rules:
o = r.subRi.copy()
for s in o:
r.subRi |= s.subRi
if len(o) < len(r.subRi):
u = True
print('subRi', r.strA('subRi'))
if False:
for r in Rule.rules:
assert r.brd == False
b = set()
r.brdGen(b)
assert r.brd == False
r.brd = b
dbg(1, 'brdGen', r.strA('body', 'brd'))
for r in Rule.rules:
if isinstance(r, RuleSeq):
bd = r.body
bL = len(bd)
for bx in range(bL-1):
b = bd[bx]
if isinstance(b, Rule) and r not in b.redCh:
for nx in range(bx+1, bL):
n = bd[nx]
if isinstance(n, str):
if n in b.brd:
err('in', r.strA('body'), 'subrule', b.strA('brd'), 'border contains follower', n, nx)
break
else:
if len(b.brd & (n.prd | n.lah)) != 0:
err('in', r.strA('body'), 'subrule', b.strA('brd'), 'border contains follower', nx, n.stra('prd', 'lah'))
if '' not in n.prd:
break
# for r in Rule.rules:
# for p in r.pos:
# g = {}
# p.goAdd(g)
# dsDbg(1, g, str(p))
if False:
for s in State.p2s.values():
if '' in s.go and len(s.go['']) != 1:
err("reduce not unique go['']", s.go[''], 'in', s)
dbg(1, s)
u = True
while u:
u = False
for s in State.p2s.values():
u |= s.auGen()
dbg(1, 'auGen')
for s in State.p2s.values():
dbg(1, 'auGen', s.strA('pa', 'au'))
stp = set()
for s in State.p2s.values():
stp.clear()
s.check(stp)
r2s = {}
s2r = {}
for s in State.p2s.values():
if '' in s.go:
for r in s.go['']:
dsAdd(r2s, r, s)
dsAdd(s2r, s, r)
dbg(1, 'r2s st',sum(len(s) for s in r2s.values()), r2s)
for ix in range(4):
for s in State.p2s.values():
for a, t in s.go.items():
if isinstance(a, Rule):
if t in s2r:
for w in s2r[t]:
dsAdd(r2s, w, s)
dsAdd(s2r, s, w)
dbg(1, 'r2s', ix, sum(len(s) for s in r2s.values()), 'r2s', r2s)
# for r in s.go['']:
# dsAdd(s2r, s,r.redP.add(r.pos[-1])
#for r in Rule.rules:
# print(str(r), 'redP', strC(r.redP))
def makePos(self, l):
assert(self.pos == None)
self.pos = [RulePos(self, ix) for ix in range(l)]
def goGen(self, g):
""" recursively add all start transactions for self to go-dict g
"""
if self.go == True:
return
elif self.go != False:
dsUpdate(g, self.go)
else:
self.go = True
self.goGenRec(g)
self.go = False
@staticmethod
def goGenRecEP(e, p, g):
""" add element e to position p to go-dict g
if e was not yet in g and g is a Rule, recursively generate go for g
"""
dbg(99, 'goGenRecEP', str(e), str(p))
if e in g:
g[e].add(p)
else:
g[e] = {p}
if isinstance(e, Rule):
e.goGen(g)
def brdGen(self, b):
""" recursively add all start transactions for self to go-dict g
"""
if self.brd == True:
return
elif self.brd != False:
b |= self.brd
else:
self.brd = True
self.brdGenRec(b)
self.brd = False
@staticmethod
def makeSymPos():
"make symbols and positions, after we got the source of all rules, convert body to symbols and bare strings"
PObj.objs[0].body[0] = PObj.objs[1].sym
Rule.rules = Rule.objs.copy()
PObj.dbg(4, 'made rules')
RuleSym.make()
PObj.dbg(4, 'made sym')
RulePos.make()
PObj.dbg(3, 'made pos')
@staticmethod
def prdLah(k):
""" compute for all rules
prd = productions of length <= k and
lah = prefixes of length k of all productions with length > k
"""
tS = datetime.now()
assert not PObj.lahStr or (all([len(y) == 1 for r in Rule.rules for y in r.body if isinstance(y, str)]))
cnt = 0
upd = True
while upd: # recompute all rules, until no change
upd = False
lC = 0
pC = 0
for r in Rule.rules:
cnt += 1
if r.prdLahComb(k):
upd = True
dbg(1, 'prdLah', str(r), r.prd, r.lah)
lC += len(r.lah)
pC += len(r.prd)
if cnt > 1000:
err('loop')
elT = f"{(datetime.now() - tS).total_seconds()}sec, " if dbgTime else ''
print(f"Rule.prdLah k={k} <- {PObj.lrK}: {cnt} {elT} #prd={pC} #lah={lC}")
if dbgLev >= 3:
for r in Rule.rules:
dbg(3, 'prdLah', r.strA('body', 'prd', 'lah'))
assert all([len(s) == k for s in r.lah for r in Rule.rules])
assert all([len(s) <= k for s in r.prd for r in Rule.rules])
PObj.lrK = k
def prdLahAdd(self, k, pp, ll):
upd = False
assert self.prd <= pp
if self.prd != pp:
assert self.prd < pp
upd = True
self.prd = pp
ll |= self.lah
jj = {len(l) for l in ll if len(l) != k}
if len(jj) != 0:
for l in [l for l in ll if len(l) == k]:
for j in jj:
ll.discard(l[0:j])
if ll != self.lah:
upd = True
dbg(99, 'lahupd', self.lah, ll)
self.lah = ll
return upd
def hasCycle(self):
if self.tst:
return True
self.tst = True
r = any(b.hasCycle() for b in self.body if isinstance(b, Rule))
self.tst = False
return r
class RuleSeq(Rule):
def __init__(self, sym, bd):
super().__init__(sym, bd)
def makePos(self):
super().makePos(len(self.body)+1)
def goGenRec(self, g):
if 0 == len(self.body):
dsAdd(g, '', self)
else:
Rule.goGenRecEP(self.body[0], self.pos[1], g)
def prdLahComb(self, k):
"compute lah and prd combining rules and already computed prd and lah"
if len(self.body) == 0:
return self.prdLahAdd(k, { PObj.lahEmpty }, set())
b = self.body[0]
pp = { b if PObj.lahStr else (b,) } if isinstance(b, str) else b.prd.copy()
ll = set() if isinstance(b, str) else b.lah.copy()
for b in (self.body[bx] for bx in range(1, len(self.body))):
n = set()
if isinstance(b, str):
for p in pp:
if len(p) < k :
n.add(p+b)
else:
ll.add((p+b)[0:k])
else:
for p in pp:
for q in b.prd:
a = p + q
if len(a) <= k:
n.add(a)
else:
ll.add(a[0:k])
for q in b.lah:
a = p + q
if len(a) >= k:
ll.add(a[0:k])
pp = n
if len(pp) == 0:
break
return self.prdLahAdd(k, pp, ll)
def brdGenRec(self, b):
bd = self.body
bL = len(bd)
if len(bd) == 0:
return
e = bd[0]
if isinstance(e, Rule) and self in e.redCh and bL > 1:
for fx in range(0, bL):
f = bd[fx]
if isinstance(f, Rule) and self in f.sub:
g = bd[1]
if isinstance(g, str):
b.add(g)
else:
b |= {p for p in g.prd if p != ''} | g.lah
break
for ex in range(len(self.body)-1, -1, -1):
e = self.body[ex]
if isinstance(e, str):
break
e.brdGen(b)
if '' not in e.prd:
break
return
for ex in range(len(self.body)-1):
e = self.body[ex]
if isinstance(e, str):
break
if e in self.redCh or self in e.redCh:
f = self.body[ex+1]
if isinstance(f, str):
b.add(f)
else:
b |= {x for x in f.prd if len(x) != 0} | f.lah
assert all(len(x) == 1 for x in b)
if '' not in e.prd:
break
assert not self.tst
e = self.body[0]
if isinstance(e, Rule) and '' in e.prd:
for ex in range(len(self.body)-2, -1, -1):
e = self.body[ex]
if isinstance(e, str):
break
if self in e.subRi:
dbg(1, 'hasCycle', str(self), str(e))
b |= {x for x in e.prd if len(x) > 0} | e.lah
if '' not in e.prd:
break
class RuleOr(Rule):
def __init__(self, sym, bd):
super().__init__(sym, bd)
def makePos(self):
super().makePos(2)
def goGenRec(self, g):
for b in self.body:
Rule.goGenRecEP(b, self.pos[1], g)
def prdLahComb(self, k):
"compute lah and prd combining rules and already computed prd and lah"
pp = set()
ll = set()
for y in self.body:
if isinstance(y, str):
pp.add( y if PObj.lahStr else (y,))
else:
pp |= y.prd
ll |= y.lah
return self.prdLahAdd(k, pp, ll)
def brdGenRec(self, b):
for e in self.body:
e.brdGen(b)
class RuleSymRemove (PObj):
"the symbols are our nonterminals, which are a set of rules"
n2y = {}
syms = None
def __init__(self, sym):
self.sym = sym
super().__init__(f"Y#{sym}")
self.rules = []
self.lah = set()
def sExp (self):
return super().sExp(f"sym={self.sym!s} {self.rules} lah={self.lah}")
@staticmethod
def make():
"make the symbols from defined rules and convert Rule.body to symbols and bare strings"
i0 = len(PObj.objs)
for r in Rule.rules: #collect symbols
if r.sym not in RuleSym.n2y:
RuleSym.n2y[r.sym] = RuleSym(r.sym)
RuleSym.n2y[r.sym].rules.append(r)
r.sym = RuleSym.n2y[r.sym]
RuleSym.syms = PObj.objs[i0:]
assert Rule.objs[0].sym == RuleSym.syms[0]
assert RuleSym.syms[0].sym == PObj.rStart
assert RuleSym.n2y[PObj.rStart] == RuleSym.syms[0]
assert len(RuleSym.n2y[PObj.rStart].rules) == 1
for r in Rule.rules: # convert Rule.body to symbols and bare strings
b = r.body
if r.pos == 'a':
for ix in range(len(b)):
v = b[ix]
if v[0] == "'" and v[-1] == "'":
b[ix] = v[1: len(v)-1]
assert b[ix] != PObj.end
elif v in RuleSym.n2y:
b[ix] = RuleSym.n2y[v]
else:
err(f"rule {v} in {r} not in {Rule.rules}")
elif r.pos == 'n':
for ix in range(len(b)):
v = b[ix]
if v in RuleSym.n2y:
b[ix] = RuleSym.n2y[v]
else:
err(f'bad pos {r.pos} in {r.sExp()}')
class RulePos (PObj):
def __init__(self, r, p):
super().__init__(r.name + '#' + str(p))
self.rule = r
self.pos = p
def strL(self):
return self.strA('rule', 'pos')
def goAdd(self, g):
if self.pos == 0:
dsUpdate(g, self.rule.go)
elif self.pos + 1 == len(self.rule.pos):
dsAdd(g, '', self.rule)
else:
b = self.rule.body[self.pos]
dsAdd(g, b, self.rule.pos[self.pos + 1])
if isinstance(b, Rule):
dsUpdate(g, b.go)
@staticmethod
def makeLah(k):
# compute lah for RuleSym and RulePos using lah from Rule
# lah = production of len <= k and prefix of len k of all production longer k
Rule.prdLah(k)
for y in RuleSym.syms:
y.lah = set().union(*[r.lah | r.prd for r in y.rules])
dbg(3, f"{y} {y.lah}")
for r in Rule.rules:
r.pos[-1].lah = {lahEmpty}
for pX in range(len(r.body)-1, -1, -1): # backward through the positions
p = r.pos[pX]
e = r.body[pX]
lS = e.lah if isinstance(e, RuleSym) else {e if PObj.lahStr else (e,)}
n = set()
for l in lS:
if len(l) == k:
n.add(l)
else:
for o in p.next.lah:
dbg(9, f"n.add(({l}+{o})[0:k])")
n.add((l+o)[0:k])
p.lah = n
dbg(3, f"{r} {pX} {p} {p.lah}")
assert r.pos[0].lah == r.prd | r.lah, (f"rule {r.sExp()} fails \npos[0].lah={r.pos[0].lah}"
f" \n== prd={r.prd} \n| lah={r.lah}"
f",\ndifference lah-..={r.pos[0].lah-r.prd-r.lah}, ..-lah={(r.prd|r.lah)-r.pos[0].lah}")
class State (PObj):
p2s = {}
state0 = None
lrErr = None
def __init__(self, pa):
super().__init__('s' + str(len(State.p2s)))
assert isinstance(pa, frozenset)
assert all(isinstance(p, RulePos) for p in pa)
assert pa not in State.p2s
self.pa = pa
self.go = None
self.done = False
self.au = {}
self.lah = {}
self.lahOne = {}
State.p2s[pa] = self
@staticmethod
def getOrMake(pa):
pa = frozenset(pa)
return State.p2s[pa] if pa in State.p2s else State(pa)
@staticmethod
def makeAll():
State.state0 = State(frozenset([Rule.rules[0].pos[0]]))
State.state0.goExp()
State.err = ''
for s in State.p2s.values():
dbg(1, 'goExp', s.strA('pa', 'go'))
for s in State.p2s.values():
for r in s.go.keys():
if isinstance(r, Rule):
r.st.add(s)
for r in Rule.rules:
dbg(1, 'stGen', r.strA('body', 'st'))
State.lahGen()
if State.err == '':
for s in State.p2s.values():
s.goFix()
def strL(self):
return self.strA('pa', 'go')
def dbg(self, lv):
if lv > dbgLev:
return
print(str(self), ':', ', '.join(str(p) for p in self.pa))
for e, s in self.go.items():
print('\t', e, ('=> ' + str(s)) if isinstance(s, State) else '<= ' + ', '.join(str(r) for r in s))
def goExp(self):
if self.go != None:
return
self.go = {}
for p in self.pa:
p.goAdd(self.go)
for e, pp in self.go.items():
if e != '':
s = State.getOrMake(pp)
self.go[e] = s
s.goExp()
def goFix(self):
if '' not in self.go:
return
dbg(1, 'goFix begin', self.strA('pa', 'go', 'lah'))
self.goFix1(self.go[''], self.go, self.lah)
dbg(1, 'goFix end ', self.strA('pa', 'go'))
def goFix1(self, rr, gg, ll):
if len(rr) == 1:
for r in rr: break
gg[''] = r
else:
assert len(rr) > 1
cnt = {r: 0 for r in rr}
for a,b in ll.items():
if isinstance(b, Leaf):
assert len(b) == 1
for k in b.keys():
if isinstance(k, Rule):
cnt[k] += 1
dbg(1, 'goFix1 cnt', cnt)
r = max(cnt.items(), key = lambda i: i[1])
dbg(1, 'goFix1 max', r)
r = r[0]
# if r[1] < 1:
# del gg['']
# else:
gg[''] = r
for a,b in ll.items():
if isinstance(b, Leaf):
assert len(b) == 1
if a != '':
for k in b.keys(): break
if k == '':
if sTo != None:
gg[a] = sTo
else:
assert a in gg
elif k != r:
gg[a] = k
elif a in gg:
del gg[a]
dbg(1, 'goFix1 ed', a, gg)
for a,b in ll.items():
if type(b) == dict:
dbg(1, 'goFix1 diving into', a, 'from', gg[a] if a in gg else '---', 'to', b)
gg[a] = {}
self.goFix1(rr, gg[a], b)
def addPosA(self, pa):
"add (position, the string after the the end of the current rule) to self, recursively add subrules"
p, a = pa
if State.lrErr != None:
return
if p.next == None: #after all positions of rule ==> reduce
if a not in self.to:
self.to[a] = p.rule
elif self.to[a] != p.rule:
State.lrErr = strL("lrK mismatch [", a, "]=", self.to[a])
else:
e = p.rule.body[p.pos]
k = PObj.lrK
t = (p.next, a)
if isinstance(e, str): #shift character on stack with next state
for l in p.lah:
assert l[0] == e
if len(l) != k:
l = (l+a)[0:k]
if l not in self.to:
self.to[l] = {t}
elif not isinstance(self.to[l], set):
State.lrErr = f"lrK mismatch [{l}]->{self.to[l]} cannot add {t}"
return
else:
self.to[l].add(t)
else:
assert isinstance(e, RuleSym)
if e not in self.to: # next state after this symbol
self.to[e] = {t}
elif t in self.to[e]:
return
else:
self.to[e].add(t)
for n in p.next.lah: # enter pos[0] of subrules, string after is lah of next position of rule
if len(n) != k:
n = (n+a)[0:k]
for r in e.rules:
self.addPosA((r.pos[0], n))
if State.lrErr != None:
return
def addTrans(self):
"modify to: find/create states for (pos, aft) sets in to.values"
for k, v in self.to.items():
if isinstance(v, Rule):
continue
pa = frozenset(v)
if pa in self.pa2u:
self.to[k] = self.pa2u[pa]
elif State.lrErr != None:
return
else:
n = type(self)(pa)
self.to[k] = n
n.addTrans()
@staticmethod
def clear():
State.states = []
State.p2s.clear()
@staticmethod
def lahGen():
for s in State.p2s.values():
s.lahGen1()
u = True
while u:
u = False
for s in State.p2s.values():
u |= s.lahGen2()
dbg(1, 'lahGen')
for s in State.p2s.values():
for a, l in s.lah.items():
for q, r in l.items():
dsAddS(s.lahOne, a, r)
dbg(1, 'lahGen', s.strA('pa', 'lah', 'lahOne'))
for s in State.p2s.values():
s.lahExp(9)
def lahGen1(self):
for a, s in self.go.items():
if isinstance(a, str) and a != '':
d2sAdd(self.lah, a, s, s)
def lahGen2(self):
u = False
if '' in self.go:
for r in self.go['']:
for s in r.st:
for e, eD in s.go[r].lah.items():
for pp in eD.values():
if e != PObj.end or isinstance(r, Rule):
u |= d2sAddS(self.lah, e, r, pp)
return u
def lahExp(self, maxi):
self.lahExp1(maxi, self.lah, [])
def lahExp1(self, maxi, dct, kys):
for a, e in dct.items():
if a == PObj.end:
dbg(1, 'exp1 end', e, kys)
elif len(e) > 1:
dbg(1, 'lahExp1 conflict ', 'expanding' if maxi > 0 else 'unresolved', ' in', self.strA('pa'), 'at lah', kys + [a], ':', e)
if maxi > 0:
self.lahExp2(maxi-1, dct, kys + [a])
elif State.err == '':
State.err = strS('lahExp1 conflict unresolved in', self.strA('pa'), 'at lah', kys + [a], ':', e)
def lahExp2(self, maxi, dct, kys):
oo = dct[kys[-1]]
assert isinstance(oo, Leaf)
nn = {}
dct[kys[-1]] = nn
for r, ss in oo.items():
dbg(1, 'exp2a', str(r), ss)
for s in ss:
for u, v in s.lahOne.items():
if u != PObj.end or isinstance(r, Rule):
d2sAddS(nn, u, r, v)
dbg(1, 'exp2z', nn)
if kys[-1] == PObj.end:
dbg(1, 'exp2zend', nn)
dbg(1, 'expanded', self.strA('pa'), 'from', oo, 'to', dct)
self.lahExp1(maxi, nn, kys)
def auGen(self):
if '' not in self.go:
return False
n = self.au.copy()
for a,s in self.go.items():
if isinstance(a, str) and a != '':
dsAdd(n,a,s)
u = False
for r in self.go['']:
for s in r.st:
for a, b in n.items():
if dsAddS(s.au, a, b - {s}):
u = True
dbg(1, 'auGen0', s.strA('au'))
return u
hist = []
done = []
def check(self, stp):
if self.done:
#dbg(1, 'check0-', State.lv, self.strA('done', 'go'))
return
l = len(State.hist)
if l >= len(State.done):
if l > len(State.done):
err('done smaller', State.done, State.hist)
State.done.append({self})
else:
if self in State.done[l]:
return
else:
State.done[l].add(self)
l += 1
if len(State.done) < l:
State.done[l].clear()
assert self.done == False
self.done = True
State.hist.append(self)
dbg(1, 'check1', len(State.hist), State.hist, self.strA('go'), stp)
gg = {g for g in self.go.keys() if isinstance(g, str) and g != ''}
if len(gg & stp) != 0:
dbg(1, 'check dead end', gg, self.strA('pa', 'go'), State.hist)
if '' in self.go:
if len(self.go['']) > 1:
dbg(1, "check several go['']", self.strA('pa', 'go'))
for g in self.go['']:
#dbg(1, "check2", str(g), g.st)
for s in g.st:
s.chec2(stp | gg, g)
self.done = False
# dbg(1, 'check9-', my, self.strA('done'))
State.hist.pop()
def chec2(self, stp, ru):
assert ru in self.go
dbg(1, 'chec2=', str(self), str(ru), str(self.go[ru]))
self.go[ru].check(stp)
@staticmethod
def makeLookahead(k):
"make state for LR(k) grammar, return errormessage if not LR(k)"
assert k >= 1
State.clear()
RulePos.makeLah(k)
State.st0 = len(PObj.objs)
State.lrErr = None
b = State(frozenset([(Rule.rules[0].pos[0], k * (PObj.end if PObj.lahStr else (PObj.end, )))]))
e = State(frozenset(b.to[RuleSym.syms[1]]))
b.addTrans()
e.addTrans()
State.states = PObj.objs[State.st0: ]
Parser.states = State.states
if State.lrErr != None:
return f"grammar not LR({k}), because of {State.lrErr}"
@staticmethod
def checkLR0():
""" check if the current LR(1) grammar is even LR(0)
return None if it is, otherwsie a string describing the conflicting state
LR(0) grammars are implemented using LR(1) grammars
LR(0) means, that a reduce step does not depend on lookahead.
a shift step however uses the char shifted (== lookahead) to determine the next state """
assert PObj.lrK == 1
for u in State.states:
assert len(u.to) > 0
fi = None
for k,v in u.to.items():
if not isinstance(k, RuleSym):
if fi == None:
fi = v
elif v != fi if isinstance(v, Rule) else not (isinstance(v, State) and isinstance(fi, State)):
return strL("grammar not LR(0) because of", u)
def eStr(e):
if type(e) != tuple and type(e) != list:
return str(e)
else:
return str(e[0]) + '>' + str(len(e)-1) + '(' + ', '.join(x if type(x) != tuple and type(x) != list else str(x[0]) + '>' + str(len(x)-1) for x in e[1:]) + ')'
class LahIter():
def __init__(self, l):
self.list = l
self.ix = 0
def __iter__(self):
return self
def __next__(self):
self.ix += 1
if self.ix <= len(self.list):
return self.list[self.ix - 1]
raise StopIteration()
def __getitem__(self, i):
i += self.ix
return self.list[i] if i < len(self.list) else PObj.end
@staticmethod
def test():
print('LahIter test begin')
iter = LahIter(['begin', 0, 1,2,3,4])
for i in iter:
print(i, iter[0], iter[1], iter[-1])
print('LahIter test end')
class Parser:
pCnt = -1
@staticmethod
def parse(inp):
assert len(State.state0.pa) == 1
for r0 in State.state0.pa: break
assert r0.pos == 0 and r0.rule == Rule.rules[0]
r0 = r0.rule
dbg(1, 'parsing for', r0)
st = [(None, State.state0)]
inI = LahIter(inp)
tNr = -1
Parser.pCnt = -1
cntL = 1000
act = 'start'
shiL = st[-1][1]
try:
while True:
Parser.pCnt += 1
e,s = st[-1]
lah = inI[0]
print('parse', (str(Parser.pCnt) + ' ' + act).ljust(20), 'lah', lah, 'stck', len(st), '>', eStr(e), str(s))
if Parser.pCnt > cntL:
err("too many loops cnt={Parser.pCnt}, lah={lah} st={st}")
# dbg(2, f'run{Parser.pCnt} {ls} --> lah={lah} tos={e}, {s} st={len(st)} {st}')
to = s.go
for fx in range(20):
to = to[inI[fx]] if inI[fx] in to else to[''] if '' in to else None
if not isinstance(to, dict): break
else:
err('parser to deeply nested lah', lah, s.go)
if isinstance(to, State):
st.append((next(inI), to))
assert st[-1][0] == lah
shiL = to
tNr += 1
act = 'shift ' + lah
lah = inI[0]
elif isinstance(to, Rule):
sx = len(st) - len(to.pos) + 1
p = [to] + [x0 for x0, x1 in st[sx : ]]
act = 'reduce ' + str(to)
del st[sx:]
st.append((p, st[-1][1].go[to]))
else:
break
except StopIteration:
assert lah == PObj.end and len(st) == 2 and st[1][0][0] == Rule.rules[1]
dbg(1, 'stopIteration parse', st)
print('parsed', str(Rule.rules[1]), 'from', len(inp), 'tokens, in', Parser.pCnt, 'steps,', len(State.p2s), 'states')
return st[1][0]
print('syntax...')
print('last', inI[-1], 'tokennr', tNr, ', lah', inI[0], inI[1], inI[2])
print(' preceeding', inp[0: tNr+1])
print(' following ', iA := inp[tNr+1: ] +[ PObj.end] )
print('stack', len(st))
for sx in range(len(st)-1, -1, -1):
print(' ', sx, ':', eStr(st[sx][0]), 'state', st[sx][1])
r = f"syntax after {inI[-1]} tokenNr {tNr} expected {set(shiL.lahOne.keys())}, not lah {iA if len(iA) <= 5 else iA[0:5]}"
print(r)
return r
@staticmethod
def makeGr(*grammar):
"make Rule, Sym and Pos for given grammer in form for function f"
dbg(1, "grammar source", grammar)
PObj.lrK = 0
Rule.clear()
Rule.add(*grammar)
Rule.finish()
State.makeAll()
@staticmethod
def makeStateRemove():
"try to make lr(k) states for k=0,1, ..."
tS = datetime.now()
k = 0
st = f"{len(RuleSym.syms)} syms, {len(Rule.rules)} rules, {len(RulePos.poss)} rulePos"
while True:
k += 1
res = State.make(k)
el = (datetime.now() - tS).total_seconds()
elT = f"after {el}sec: " if dbgTime else ''
if res == None:
break
print(f"{res}: {elT}{st} {len(State.states)} states")
if el > 7:
print("stop by timeout, do not check next LR(k)")
return False
if k > 20:
print("stop by k >Y 20, do not check next LR(k)")
return
if k == 1:
res = State.checkLR0()
if res == None:
k=0
else:
print(f"{res}: {elT}")
print(f'grammar is LR({k}): {elT}{st} {len(State.states)} states')
Parser.states = State.states
PObj.dbg(1, 'State 0')
return True
@staticmethod
def testGr(nm, syn, *cs):
"""test named nm, with grammar syn and testcases cs"""
print(f"{nm} --- begin test -----------")
Parser.makeGr(*syn)
for s in State.p2s.values():
dbg(1, s.strA('pa', 'go'))
if State.err != '':
print('error generating grammar', State.err)
else:
Parser.test(nm, *cs)
@staticmethod
def test(nm, *tsts):
"test for the current grammar, named nm the tests tsts"
if tsts == None:
tsts = ['2', '1 + 2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (']
trS = 0
def treeSplit(nd, pr):
nonlocal trS
trS += 1
if isinstance(nd, str):
return pr + nd
r = pr + str(nd[0]) + '/' + str(len(nd[0].body))
for ch in nd[1:]:
r += treeSplit(ch, pr + ' ')
return r
def treeSpli2(nd, of):
nonlocal trS
trS += 1
if isinstance(nd, str):
return nd
r = str(nd[0]) + ' ' # + '/' + str(len(nd[0].body))
of += len(r)
if len(nd) > 1:
r += treeSpli2(nd[1], of)
for ch in nd[2:]:
r += "\n" + of * ' ' + treeSpli2(ch, of)
return r
for i in range(len(tsts)) :
sT = [x for x in re.split(r'\s+', tsts[i]) if x != ''] # void empty strings, e.g. for ''
print(f"test begin {nm} {i} input {sT} ..........................")
ast = Parser.parse(sT)
if isinstance(ast, str):
print(f'syntax test {nm} {i} input {tsts[i]}: {ast}')
else:
print(f"parsed {str(ast[0])} from {len(sT)} tokens, {Parser.pCnt} steps, {len(State.p2s)} states, Rules {len(Rule.rules)}")
print(' ', treeSpli2(ast, 2))
print('test', 'syntax' if isinstance(ast, str) else 'parsed', f"{nm} {i} input {sT} ..........................")
@staticmethod
def testSyn(nm, syn, *cs):
"""test named nm, with grammar syn and testcases cs"""
Parser.testGr(nm, syn, *cs)
@staticmethod
def test1():
"""my tests """
g = ["s=s+p", "s = p", "p = p '*' i", "p = i", "i = '(' s ')'", "i = '0'", "i = '1'", "i = '2'"]
Parser.testSyn('arithExpr', g
,'2', '1 + 2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +')
g2 = g[1:-3] + ["s = s pm p", "p = p '/' i", "i = epm d", "i = epm i d", "epm = pm", "epm ="
, "pm = '+'", "pm = '-'", "d = '0'", "d = '1'", "d = '2'"]
g = g[1:-3] + ["s = s pm p", "p = p '/' i", "i = epm d", "i = i d", "epm = pm epm", "epm ="
, "pm = '+'", "pm = '-'", "d = '0'", "d = '1'", "d = '2'"]
Parser.testSyn('arithExprPM' , ["s=s pm p", "s=p", "p=p*i", "p=p/i", "p=i", "i=( s )", "i = j"
, "j=epm d", "j=j d", "epm = pm epm", "epm = ", "pm=+", "pm=-", "d=0", "d=1", "d = 2 "]
,'2', '1 + 2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +'
, '1 0 / - - 1 1 + + - 1 2 / ( 1 - - 2 / + 0 )')
Parser.testSyn('arithExprPE' , ["s=s pm p", "s=p", "p=p*e", "p=p/e", "p=e", "e=i", "e=i**e", "i=( s )", " i=j"
, "j=epm d", "j=j d", "epm = pm epm", "epm = ", "pm=+", "pm=-", "d=0", "d=1", "d = 2 "]
,'2', '1 + 2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +'
, '1 0 / - - 1 1 + + - 1 2 / ( 1 - - 2 / + 0 )')
Parser.testSyn('LR(2)?' , ["S = AA 'a' 'a'", "S = AB 'a' 'b'", "AA =", "AB ="]
,"a a", "a b", "a c", "b a")
@staticmethod
def testKnuth():
"""some of the examples given by Donald E. Knuth. 1965. On the Translation of Languages from Left to Right """
Parser.testSyn('knuth(1)'
, ["S = A D", "A = 'a' C", "B = 'b' 'c' 'd'", "C = B E", "D =", "E = 'e'"]
, "a b c d e", "a b x d e", "a b", "a b c d e f")
Parser.testSyn('knuth(6)'
, ["S = 'a' A 'c'", "A = 'b' A 'b'", "A = 'b'"]
, "a b b b c", "a b c", "a b b c", "a b b b b b b b c e")
Parser.testSyn('knuth(7) lr(0)'
, ["S = 'a' A 'c'", "A = A 'b' 'b'", "A = 'b'"]
, "a b b b c", "a b c", "a b b c", "a b b b b b b b c e")
Parser.testSyn('knuth(16x) lr(3)'
, ["S=B C", "B=C e", "B = ", "C=D", "C=D c", "D=", "D=d"]
, "", "c", "c e", "c e d")
Parser.testSyn('knuth(24)'
, ["S =", "S = 'a' A 'b' S", "S = 'b' B 'a' S", "A =", "A = 'a' A 'b' A", "B =", "B = 'b' B 'a' B"]
, "", "a b", "b a", "a b b", "a a b b", "a b a b", "b a a b")
# "IS = IP", "IS = IS + IP", "IS = IS - IP", "IP = IE", "IP = IP * IE", "IP = IP / IE", "IE = IA", "IE = IA * * IE", "IA = IB", "IA = + IA", "IA = - IA", "IB =( IS )", "IB = ID", "IB = IB ID", "ID = 0", "ID = 1", "ID = 2" ...
def testRe():
x = ' ab cd e?f *+??'
print(f"x={x}")
print("re: lookahead assertion and lookbehind assertions to find boundaries between words and or special character")
for y in re.finditer(r'\s+|(?<=\S)(?=\S)((?<=\W)|(?=\W))', x):
print([y.start(),y.end(), x[y.start(): y.end()+1]])
print("easier and more stable against leading/trailing spaces: use findall for words or special chars")
for y in re.finditer(r'\w+|\S', x):
print([y.start(),y.end(), x[y.start(): y.end()]])
print(re.findall(r'\w+|\S', x))
x += "' '' b 'oder1''1.2und. 3 ' '.a.234 4.. single' !"
print('mit strings and numbers:', x)
print(re.findall(r"""'(?:''|[^'])*' # single quoted strings, doubble apostrophs for a single
# non capturing group (?:...) otherwise findall will return groups!
|\d+(?:\.\d*) # number with leading digits and possible decimal point/digits
|\.\d+ # number with leading decimal points and digits
|\w+ # words
|\S""" # special chars
, x
, re.X)) # re.X = re.VERBOSE flag
if 0:
testRe()
exit()
if 0:
LahIter.test()
l = [0, 1, 2]
s = '012'
print([(i, l[0:i], s[0:i]) for i in range(6)])
print([(x, y) for x in range(3) for y in range(2)])
print([(x, y) for x in range(3) for y in range(3*x, 4*x+1)])
print([(x, y) for x in range(3) if x > 0 for y in range(3*x, 4*x+1) if y > x])
exit()
Parser.test1()
Parser.testKnuth()
print('end', __file__)