file browser

python/parserVgq.py

# knuth lr(k) parser

#---------- output error debug ----------
def out(*ee, sep = ', '):
    print(strL(*ee, sep = sep))
    
def outS(*ee, sep = ', '):
    print(strS(*ee, sep = sep))

def err(*e, sep = ', '):
    raise Exception('error: ' + strL(*e, sep = sep))
    exit()

def errS(*e, sep = ', '):
    raise Exception('error: ' + strS(*e, sep = sep))
    exit()

dbgLev = 10
def dbg(l, *e, sep = ', '):
    if dbgLev >= l:
        print('dbg: ' + strL(*e, sep = sep))

def dbgS(l, *e, sep = ', '):
    if dbgLev >= l:
        print('dbg: ' + strS(*e, sep = sep))
        import re


#---------- string representations ----------
""" use method strL for nice lont string representations """
from datetime import * ;
import re;

strSep1 = '--------------------------------'

def strL(*ee, sep = ', '):
    """ return a long stringrepresentation
            elements of ee in long (strL) representation, separated by sep
            deep dive into collections, represented by strS
    """    
    return     sep.join(e if type(e) == str else e.strL() if hasattr(type(e), 'strL') else strC(e) for e in ee)

def strS(*ee, sep = ', '):
    return     sep.join(e if type(e) == str else strC(e) for e in ee)

def strC(e):
    """return a str represntig e. recursively expand collections, otherwise use str"""
    t = type(e)
    if t == str:
         return repr(e) if strCW.fullmatch(e) == None else e
    elif t == tuple:
         return '(' + ', '.join(strC(v) for v in e) + ')'
    elif isinstance(e, list): # t == list:
         return '[' + ', '.join(strC(v) for v in e) + ']'
    elif t == set or t == frozenset:
        return '{' + strSo(e, fmt=strC) + '}'
    elif isinstance(e, dict):
        return '{' + ', '. join(strC(k) + ': ' + strC(v) for k, v in e.items()) + '}'
    else:
        return str(e)

def strSo(ee, sep=', ', fmt=str):
    return sep.join(sorted([fmt(e) for e in ee]))
    
strCW = re.compile('\w+')
strNSs = re.compile('\s')

def strNS(s):
    return strC(s) if not isinstance(s, str) else s if s != '' and strNSs.search(s) == None else repr(s)

def strNSsq(s):
    return ' '.join(strNS(e) for e in s)

def strNSset(s):
    return strNSsq(sorted(s))

#---------- ds: dictionary to set ----------
def dsAdd(g, e, p):
    """ add p to set at g[e] 
        return True if g has been changed
    """
    if e in g:
        s = g[e]
        o = len(s)
        s.add(p)
        return o < len(s)
    else:
        g[e] = {p}
        return True

def dsUnion(g, e, s):
    """ update/create g[e] with the union of s
        return True if g has been changed
    """
    if len(s) < 1:
        return False
    if e in g:
        o = len(g[e])
        g[e] |= s
        return len(g[e]) > o
    else:
        g[e] = s.copy()
        return True
def dsUpdate(le, ri):
    """ update/create le[k] with the union of ri[k]
        return True if le has been changed
    """
    u = False
    for k,v in ri.items():
        assert len(v) > 0
        if k not in le:
            le[k] = set()
        o = len(le[k])
        le[k] |= v
        if o!= len(le[k]):
            u = True
    return u

#---------- dd: dictionary tree ----------
def ddPut(d, *key):
    """ add the key sequence to tree dict d
        '' is a special key for the value at this node
    """
    e = d
    for k in key[0:-2]:
        if k not in e:
            e[k] = {}
        elif not isinstance(e[k], dict):
            e[k] = {'': e[k]}
        e = e[k]
    k, v = key[-2:]
    if k not in e or not isinstance(e[k], dict):
        e[k] = v
    else:
        e[k][''] = v
    dbg(30, 'ddPut after key=', key, 'd=', d)

def ddGet(d, *key):
    """ return the value in d at the keySequence key. or None if not present """
    e = d
    for k in key:
        if not isinstance(e, dict) or k not in e:
            return None
        e = e[k]
    return e if not isinstance(e, dict) else e[''] if '' in e else None

def ddRed(d, dflt = None):
    """ reduce tree d: if all nodes point to the same value replace it by the single value 
        if dflt != None, remove all entries with value dflt
    """
    hasDi = False
    for k, v in d.items():
        if isinstance(v, dict):
            v = ddRed(v, dflt)
            d[k] = v
            hasDi |= isinstance(v, dict)
    dbg(20, 'ddRed di', hasDi)
    if not hasDi:
        vv = set(d.values())
        dbg(20, 'ddRed vv', vv)
        if len(vv) == 1:
            return anyOf(vv)
    if dflt != None:
        assert '' not in d or dflt == d['']
        for k in [k for k,v in d.items() if v == dflt and k != '']:
            del d[k]
        d[''] = dflt                
    return d
            
#---------- div helpers ----------
def anyOf(c):
    """ return an arbitrary Element of a collection or iterator """
    for a in c:
        return a
    err('anyOf empty', c)
    
#---------- PObj: the root class ----------
class PObj:
    "root and container for all our objects"
    end = '!'
    sqStr = True # a sequence of nonterminals is representend as: a single string if sqStr is True else a tuple of strings
    sqEmpty = '' if sqStr else ()
    sqEnd = end if sqStr else (end, )
    lrK = 0

    def setName(self, nm):
        self.name = nm

    def __init__(self, nm = ''):
        self.setName(nm)

    def __str__ (self):
        return self.name

    def strA(self, *aa):
        return str(self) + '{' + ', '.join(a + '=' + strC(getattr(self, a)) if (ex := a.find('=')) < 0 else a[0:ex] + '=' + strC(getattr(self, a[ex+1:])) for a in aa) + '}'

#---------- Rule: grammar rule ----------
class Rule (PObj):
    """ a Rule has a name (a symbol or nonterminal) and a body of terminals and nonterminals """
    rules = []
    ruleAll = []
    s2r = {}
    def __init__(self, sy, bd):
        """ ini and add to ruleAll and s2r """
        super().__init__(sy)
        self.body = bd
        self.pos = None
        self.gr = None
        self.gq = False
        self.suq = set()

        Rule.ruleAll.append(self)
        if sy != '':
            if sy in Rule.s2r:
                Rule.s2r[sy].append(self)
            else:
                Rule.s2r[sy] = [self]

    def strA(self, *aa):
        return super().strA(type(self).__name__[4:] + '=' + aa[0] if aa[0].find('=') < 0 else aa[0], *aa[1:])

    def strL(self):
        return self.strA('body', 'gr')
        
    @staticmethod
    def clear(): 
        PObj.lrK = 0
        Rule.rules.clear()
        Rule.ruleAll.clear()
        Rule.s2r.clear()
        State.clear()

    @staticmethod
    def add(*src):
        """add new rules: each rule is a string
                elements are words (double or sinble quoted strings, spechial char
            if an unquoted element is found as a rule name, it designates a rule, otherwise a terminal
        """
        for s in src:
            spl = re.findall('\w+|\'(?:[^\']|\'\')*\'?|"(?:[^"]|"")*"?|\S', s)     # words | single quoted | doubble quoted | single nonspace char
                                                                                # (?: non capturing group, otherwise findall will use group!
            if len(spl) > 0 and spl[-1][0] in "\"'" and (len(spl[-1]) < 2 or spl[-1][0] != spl[-1][-1]):
                err('missing endquote string:', spl[-1], 'in source', s, src)
            assert(spl[1] == '=')
            RuleSeq(spl[0], spl[2:])


    def makePos(self, l):
        """ make positions for this rule """
        assert(self.pos == None)
        self.pos = [RulePos(self, ix) for ix in range(l)]
        dsAdd(self.pos[-1].prd, PObj.sqEmpty, self)
        
    def gqGen(self):
        assert self.gq == False
        g = {}
        self.gqGe2(g, [])
        assert self.gq == False
        self.gq = g
        if isinstance(self, RuleOr):
            for b in self.body:
                if isinstance(b, Rule):
                    b.suq.add(self)
        else:
            l = len(self.body)
            for bx in range(l):
                b = self.body[bx]
                if isinstance(b, Rule):
                    b.suq.add(self if bx == l-1 else self.pos[bx+1])
                
    def gqGe2(self, gg, h):
        if self.gq == True:
            return
        elif self.gq != False:
            hT = tuple(h)
            for a, tt in self.gq.items():
                for t in tt:
                    dsAdd(gg, a, hT+t)
        elif len(self.pos) <= 1:
            dsAdd(gg, '', tuple(h+[self]))
        else:
            self.gq = True
            h.append(self.pos[1])
            self.gqGe3(gg, h)
            h.pop()
            assert self.gq == True
            self.gq = False

    def gqExt(self):
        if '' not in self.gq:
            return
        for p in self.gq['']:
            if self.gqEx2(['bot', p if isinstance(p, RulePos) else p.pos[-1]], self.gq):
                dbg(1, 'gqEx2 True')


    def gqEx2(self, h, to):
        p = h[-1]
        assert isinstance(p, RulePos)
        if p.pos + 1 == len(p.rule.pos):
            return True
        assert isinstance(RuleSeq)
        b = p.rule.body[p.pos]
        n = p.rule.pos[p.pos+1]
        h[-1] = n
        if isinstance(b, str):
            dsAdd(to, b, tuple(h))
        elif isinstance(b, RuleSeq):
            h.append(b.pos[0])
            res = self.gqEx2(tuple(h), to)
            h.pop()
        else:
            res = False
            h.append(0)
            for b in b.body:
                h[-1] = 0
                res |=    self.gqEx2(tuple(h), to)
            h.pop()
        h[-1] = p
        return res
            
    def grGen(self):
        """ generate gr (left subrules, recursively) for the gr already computed
            return True/False if an update needs a further round of grGen
        """
        u = False
        for r in list(self.gr.keys()):
            u |= dsUpdate(self.gr, r.gr)
        return u

    @staticmethod
    def prdGen():
        u = True
        c = 0
        while u:
            c += 1
            if c > 40:
                err('too many')
            u = False
            for r in reversed(Rule.rules):
                u |= r.prdGen()
            dbg(15, 'prdGen round')
        dbg(3, 'prdGen', c, 'rounds')

    @staticmethod
    def lahGen():
        for r in Rule.rules:
            for p in r.pos:
                p.lah.clear()
                dsUpdate(p.lah, p.prB)
        dsAdd(Rule.rules[0].pos[-1].lah, PObj.sqEmpty, Rule.rules[0])
        u = True
        lC = 0
        uC = 1
        while uC > 0:
            lC += 1
            uC = 0
            for r in Rule.rules:
                uC += r.lahGen()
            dbg(5, f'lahGen loop {lC}, upd {uC}')

    @staticmethod
    def genK(k):
        """ generate lookAheads for lr=K """
        if k <= PObj.lrK:
            err(f"genK({k} bur lrK already {PObj.lrK}")
        PObj.lrK = k
        Rule.prdGen()
        Rule.lahGen()
        if k == 1:
            for r in Rule.rules:
                r.gqGen()
            for r in Rule.rules:
                dbg(10, 'gqGen', r.strA('body', 'gq', 'suq'))
                oo = {}
                dsUpdate(oo, r.gq)
                r.gqExt()
                if oo != r.gq:
                    dbg(10, 'gqExt', r.strA('body', 'gq', 'suq'))

        if dbgLev >= 10:    
            for r in Rule.rules:
                dbg(10, 'gen', r.strA('body'), '\n\t' + '\n\t'.join(p.strA('prd', 'prB', 'lah') for p in r.pos))

    @staticmethod
    def finish():
        """ after all rules are added, convert them to the final form and generate lrK=1 lookaheads """
        #--- add RuleOr for nonterminals withe several alternatives and put them to Rule.rules 
        assert len(Rule.rules) == 0
        for r in Rule.ruleAll.copy():
            l = Rule.s2r[r.name] 
            if type(l) != list:
                assert type(l) == RuleOr and r in l.body
            else:
                assert type(l) == list and r in l, r.strA('body') + ', l=' + strC(l)
                if len(l) == 1:
                    Rule.s2r[r.name] = r 
                else: 
                    Rule.s2r[r.name] = RuleOr(r.name + '@' + str(len(Rule.rules)), l)
                    Rule.rules.append(Rule.s2r[r.name])
            r.setName(r.name + '@' + str(len(Rule.rules)))
            Rule.rules.append(r)        
        #--- in all rules resolve symbols to its rule 
        for r in Rule.rules:
            for i in range(len(r.body)):
                b = r.body[i]
                if type(b) == str:
                    if b[0] == "'" and b[-1] == "'" or b[0] == '"' and b[-1] == '"':
                        b = b[1: -1]
                    elif b in Rule.s2r:
                        b = Rule.s2r[b]
                    r.body[i] = b
                    assert b != PObj.end or r == Rule.rules[0]
            r.makePos()
            dbg(5, 'makePos body/pos len', len(r.body) , len(r.pos), r.strA('body','pos'))
        #--- generate gr, the left subrules recursively
        for r in Rule.rules:
            r.grGe0()
        u = True
        while u:
            u = False
            for r in reversed(Rule.rules):
                u |= r.grGen()
            dbg(10, 'grGen round')
        for r in Rule.rules:
            dbg(10, 'grGen', r.strA('body', 'gr'))
        Rule.genK(1)
        
#---------- RuleSeq: a Rule that is a sequence of terminals or rules ----------
class RuleSeq(Rule):
    def __init__(self, sym, bd):
        super().__init__(sym, bd)

    def makePos(self):
        super().makePos(len(self.body)+1)
        
    def gqGe3(self, gg, h):
        if len(self.body) == 0:
            dsAdd(gg, '', tuple(h +[self]))
        else:
            b = self.body[0]
            dsAdd(gg, b, tuple(h))
            if isinstance(b, Rule):
                b.gqGe2(gg, h)            
        
    def grGe0(self):
        """ compute inital value for gr"""
        self.gr = {self.body[0]: {self.pos[1]}} if len(self.body) > 0 and isinstance(self.body[0], Rule) else {}

    def prdGen(self):
        """ compute prd and prB from the values already computed (one step) """
        pp = self.pos[-1].prd
        ll = self.pos[-1].prB
        p0 = self.pos[0].prd
        l0 = self.pos[0].prB
        assert len(ll) == 0
        assert len(pp) == 1 and Rule.sqEmpty in pp and pp[Rule.sqEmpty] == {self}
        for bx in range(len(self.body)-1, -1, -1):
            b = self.body[bx]
            o = self.pos[bx]
            oN = self.pos[bx+1]
            pN = {}
            lN = {}
            if isinstance(b, str):
                a = b if PObj.sqStr else (b, )
                for p in pp.keys():
                    if len(p) < PObj.lrK:
                        dsAdd(pN, a+p, oN)
                    else:
                        dsAdd(lN, (a+p)[0:PObj.lrK], oN)
                for l in ll.keys():
                    dsAdd(lN, (a+l)[0:PObj.lrK], oN)
            else:
                for a, t in b.pos[0].prd.items():
                    if len(a) == PObj.lrK:
                        if PObj.sqEmpty in pp:
                            dsUnion(pN, a, t)
                        if any(len(p) > 0 for p in pp) or any(len(l) > 0 for l in ll):
                            dsUnion(lN, a, t)
                    else:
                        for p in pp.keys():
                            if len(a) + len(p) <= PObj.lrK:
                                dsUnion(pN, a+p, t)
                            else:
                                dsUnion(lN, (a+p)[0: PObj.lrK], t)
                        for l in ll.keys():
                            if len(a) + len(l) >= PObj.lrK:
                                dsUnion(lN, (a+l)[0: PObj.lrK], t)
                for a, t in b.pos[0].prB.items():
                    if len(a) == PObj.lrK:
                        dsUnion(lN, a, t)
            o.prd = pN
            o.prB = lN
            pp = pN
            ll = lN    
        return p0 != self.pos[0].prd or l0 != self.pos[0].prB

    def lahGen(self):
        """ compute lah from the values already computed of pos[-1].lah and prd and prB (one step) """
        u = False
        pp = self.pos[-1].lah
        nn = pp
        #dbg(1, 'lahGen0', self.strA('body'), pp)
        for qx in range(len(self.pos)-2, -1, -1):
            q = self.pos[qx]
            b = self.body[qx]
            if isinstance(b, Rule):
                u |= b.pos[-1].lahExt(nn)
            q.lahExt(pp)
            nn = q.lah
            #dbg(1, 'lahGen2', q.strA('lah', 'prd', 'prB'))
        return u

#---------- RuleOr: a Rule conisting of oa choice of terminals or rules ----------
class RuleOr(Rule):
    def __init__(self, sym, bd):
        super().__init__(sym, bd)

    def makePos(self):
        super().makePos(2)

    def gqGe3(self, gg, h):
        h.append(self.pos[1])
        for    b in self.body:
            dsAdd(gg, b, tuple(h))
            if isinstance(b, Rule):
                b.gqGe2(gg, h)
        h.pop()
                
    def grGe0(self):
        """ compute inital value for gr"""
        self.gr = {b: {self.pos[1]} for b in self.body}

    def prdGen(self):
        """ compute prd and prB from the values already computed (one step) """
        assert len(self.pos[-1].prB) == 0
        assert len(self.pos[-1].prd) == 1 and Rule.sqEmpty in self.pos[-1].prd and self.pos[-1].prd[Rule.sqEmpty] == {self}
        pp = {}
        ll = {}
        for b in self.body:
            if isinstance(b, str):
                dsAdd(pp, b if PObj.sqStr else(b,), self.pos[1])
            else:
                for a,t in b.pos[0].prd.items():
                    dsUnion(pp, a, t)
                for a,t in b.pos[0].prB.items():
                    dsUnion(ll, a, t)
        if self.pos[0].prd == pp and self.pos[0].prB == ll:
            return False
        self.pos[0].prd = pp
        self.pos[0].prB = ll
        return True

    def lahGen(self):
        """ compute lah from the values already  computed of pos[-1].lah and prd and prB (one step) """
        u = False
        pp = self.pos[-1].lah
        nn = {}
        for b in self.body:
            if isinstance(b, Rule):
                u |= b.pos[-1].lahExt(pp)
                # dbg(1, 'lahGen1', b.pos[-1].strA('rule', 'pos', 'lah'), 'nn=', nn)

        self.pos[0].lahExt(pp)
        return u

#---------- RulePos: position in a Rule ----------
class RulePos (PObj):
    def __init__(self, r, p):
        super().__init__(r.name + '#' + str(p))
        self.rule = r
        self.pos = p
        self.prB = {}
        self.prd = {}
        self.lah = {}
        
    def strL(self):
        return self.strA('rule', 'pos')

    def lahExt(self, ee):
        """ extend self.lah by ee, return True if lah has been updated"""
        u = False
        for a, p in self.prd.items():
            if len(a) == PObj.lrK:
                u |= dsUnion(self.lah, a, p)
            else:
                for e in ee.keys():
                    u |= dsUnion(self.lah, (a+e)[0: PObj.lrK], p)
        return u

#---------- State: a set of positions ----------
class State (PObj):
    """
        state is set of positions, not (set of positions, lookAhead) as for knuth
        thus, we reduce the number of states
    """
    p2s = {}
    state0 = None
    lrErr = None

    def __init__(self, pa):
        super().__init__('s' + str(len(State.p2s)))
        assert isinstance(pa, frozenset)
        assert all(isinstance(p, RulePos) for p in pa)
        assert pa not in State.p2s
        self.pa = pa
        self.go = None
        State.p2s[pa] = self    

    @staticmethod
    def getOrMake(pa):
        pa = frozenset(pa)
        return State.p2s[pa] if pa in State.p2s else State(pa)

    @staticmethod
    def clear():
        State.states = []
        State.p2s.clear()
        
    def strL(self):
        return self.strA('pa', 'go')
            
    @staticmethod
    def makeAll():
        State.state0 = State(frozenset([Rule.rules[0].pos[0]]))
        State.state0.goExp()        
        State.err = ''
    
        for s in State.p2s.values():
            dbg(5, 'goExp', s.strA('pa', 'go'))


        for lrK in range(1,5):
            dbg(1, 'gen lrK', lrK)
            if lrK > 1:
                Rule.genK(lrK)
            cnf = False
            for s in State.p2s.values():
                cnf |=  s.goFix()
            if not cnf:
                break
        else:
            State.err = f'still conflicts at lrK {lrK}'
        if dbgLev >= 5:
            dbg(5, 'makeAll lr={lrK}', State.err, '\n\t' + '\n\t'.join(s.strA('pa', 'go') for s in State.p2s.values()))
    
    def goExp(self):
        """ compute lr1 go, and recursively create necessary states """
        if self.go != None:
            return
        self.go = {}
        for p in sorted(self.pa, key=lambda p: p.name): # sort for debugging: to ensure sequence of states
            dsUpdate(self.go, p.lah if Rule.sqStr else {'' if len(a) == 0 else a[0]: t for a,t in p.lah.items()})
            r = p.rule
            if p.pos < 1:
                dsUpdate(self.go, r.gr)
            elif isinstance(r, RuleSeq) and p.pos < len(r.body):
                r = r.body[p.pos] 
                if isinstance(r, Rule):
                    dsAdd(self.go, r, p.rule.pos[p.pos+1])
                    dsUpdate(self.go, r.gr)
        for a, tt in self.go.items():
            t1 = frozenset(t for t in tt if isinstance(t, RulePos))
            if len(t1) > 0:
                s = State.getOrMake(t1)
                if isinstance(a, Rule):
                    assert tt == t1
                    self.go[a] = s
                else:
                    self.go[a] = (tt - t1) | {s}
                s.goExp()    

    def goFix(self):
        """ use lah, to extend go to a tree
            if conflicts (i.e. grammar nor lr(lrK) return True
            else adapt go accordingly
        """
        #--- compute lah
        if len(self.pa) == 1:
            ll = anyOf(self.pa).lah
        else:
            ll = {}
            for p in self.pa:
                dsUpdate(ll, p.lah)
        assert all(PObj.end not in k for k in ll.keys())

        #--- build tree of lookAheads and detect conflicts
        e = {}
        cnf = False
        for a, t in sorted(ll.items()): # sort for debugging: to ensure sequence of conflicts
            pp = {u for u in t if isinstance(u, RulePos)}
            if len(pp) < 1:
                tt = t
            else:
                assert len(a) > 0
                gg = self.go[a[0]]
                if not isinstance(gg, set):
                    if isinstance(gg, Rule):
                        if not isinstance(gg, State):
                            errS('f3', self, a, t, self.go[a[0]])
                    ddPut(e, *a, gg)
                    continue
                s = {u for u in gg if isinstance(u, State)}
                assert len(s) == 1
                s = anyOf(s)
                dbg(13, 'goFix confliXyy', a, pp, s)
                assert all(p in s.pa for p in pp)
                tt = (t - pp) | {s}
            if len(tt) == 1:
                ddPut(e, *(a if len(a) > 0 else ['']), anyOf(tt))
            else:
                dbg(3, 'goFix conflict in ', self.strA('pa'), ', lah=', strNSsq(a), ', to=', strSo(v.strA('pa') if isinstance(v, State) else str(v) for v in tt), sep='')
                cnf = True
        if cnf:
            return True        
        #--- find default and reduce tree
        if '' in e:
            dflt = e['']
        else:
            rr = {p.rule for p in self.pa if p.pos == len(p.rule.pos)-1}
            dflt = anyOf(rr) if len(rr) == 1 else None #anyOf(None
        assert dflt == None or isinstance(dflt, Rule)
        ddRed(e, dflt)
        dbgS(10, 'goFixa3 red dflt', dflt, e) 
        # merge tree into go
        for a,t in e.items():
            assert a not in self.go or isinstance(t, dict) or t == self.go[a] or {t} == self.go[a], strS('a=',a, 't=', t, self.strA('pa', 'go'))
            self.go[a] = t
        if dflt != None:
            dfltS = {dflt}
            for k in [k for k,v in self.go.items() if v == dflt or v == dfltS]:
                del self.go[k]
            self.go[''] = dflt
        dbg(5, 'goFixa9 go', self.strA('pa', 'go'))
        return False

    @staticmethod
    def checkLR0():
        """ check if the current LR(1) grammar is even LR(0)
            
            return None if it is, otherwsie a string describing the conflicting state

            LR(0) grammars are implemented using LR(1) grammars

            LR(0) means, that a reduce step does not depend on lookahead.
                a shift step however uses the char shifted (== lookahead) to determine the next state 
        """
        assert PObj.lrK == 1
        for u in State.states:
            assert len(u.to) > 0
            fi = None
            for k,v in u.to.items():
                if not isinstance(k, RuleSym):
                    if fi == None:
                        fi = v
                    elif v != fi if isinstance(v, Rule) else not (isinstance(v, State) and isinstance(fi, State)):
                        return strL("grammar not LR(0) because of",  u)

            
#---------- LahIter: input iterator ----------
class LahIter():
    """ input iterator with __getitem__ for lookahead """
    def __init__(self, l):
        self.list = l
        self.ix = -1

    def __iter__(self):
        return self
        
    def __next__(self):
        self.ix += 1
        if self.ix < len(self.list):
            return self.list[self.ix]
        raise StopIteration()
    def __getitem__(self, i):
        i += self.ix+1
        return '<before>' if i < 0 else self.list[i] if i < len(self.list) else PObj.end

#---------- Parser: the Parser ----------
class Parser:
    pCnt = -1

    @staticmethod
    def eStr(e):
        if type(e) != tuple and type(e) != list:
            return str(e)
        else:
            return str(e[0]) + '>' + str(len(e)-1) + '(' + ', '.join(x if type(x) != tuple and type(x) != list else str(x[0]) + '>' + str(len(x)-1) for x in e[1:]) + ')'
    @staticmethod
    def parse(inp):
        """ the parsing state machine """
        assert len(State.state0.pa) == 1
        assert len(q := State.state0.pa) == 1 and (q := anyOf(q)).pos == 0 and q.rule == Rule.rules[0]
        r0 = Rule.rules[0]
        dbgS(15, 'parsing for', r0, 'input', strNSsq(inp))
        
        st = [(None, State.state0)]
        inI = LahIter(inp)
        Parser.pCnt = -1
        cntL = 1000
        act = 'start'
        shiL = st[-1][1]
        res = None
        
        try:    # end of input or syntax may throw Keyerror exceptions
            while True:
                Parser.pCnt  += 1
                e,s = st[-1]
                lah = inI[0]
                dbgS(10, 'parse', (str(Parser.pCnt) + ' ' + act).ljust(20), 'lah', lah, 'stck', len(st), '>', e, s)
                if Parser.pCnt  > cntL:
                    err(f"too many loops cnt={Parser.pCnt}, lah={lah} st=", st)

                to = s.go
                dbg(15, 'parse to0', to)
                for fx in range(PObj.lrK): # dive into go tree along lookahead
                    to = to[inI[fx]] if inI[fx] in to else to['']  # Keyerror means syntax or end
                    dbg(20, 'parse to1', to)
                    if not isinstance(to, dict): break
                else:
                    errS('parser to deeply nested lah=', lah, 'to=', to, 's=', s.strA('pa', 'go'))
                if isinstance(to, State): # shift first lookahead
                    st.append((next(inI), to))
                    assert st[-1][0] == lah
                    shiL = to
                    act = 'shift ' + lah
                    lah = inI[0]
                elif isinstance(to, Rule): # reduce with rule to
                    sx = len(st) - len(to.pos) + 1
                    p = [to] + [x0 for x0, x1 in st[sx : ]]
                    act = 'reduce ' + str(to)
                    del st[sx:]
                    st.append((p, st[-1][1].go[to]))
                else:
                    err('parse else to', to, 'st=', st)
        except KeyError:
            if isinstance(to, State):
                dbgS(5, 'parse keyError State to=', to, act)
            elif isinstance(to, Rule):
                dbgS(5, 'parse keyError Rule to=', to, sx, p, act)
                res = p if len(st) == 1 else st[-1][0] # maybe we reduced st[1] !!!
            elif isinstance(to, dict):
                dbgS(5, 'parse keyError dict', to)

                res = st[-1][0]
            else:    
                err('parse keyError else', to)
        except StopIteration:
            err('parse internal')            
        
        dbg(20, 'parseEnd lah', lah, 'act', act, 'to', to
            , 'stack =\n\t' + '\n\t'.join(strS(str(sx) + ': ' + Parser.eStr(st[sx][0]), 'state =', st[sx][1]) for sx in range(len(st)-1, -1, -1)))
        if lah == PObj.end and len(st) <= 2 and res[0] == Rule.rules[0]:
            dbgS(5, 'parsed', str(res[0]), 'from', len(inp), 'tokens, in', Parser.pCnt, 'steps,', len(State.p2s), 'states')
            return res
        else:    
            tNr = inI.ix        
            iA = inp[tNr+1: ] + [PObj.end]
            iB = inp[0:tNr+1]
            ex = set() # compute expected: 
            for p in shiL.pa: 
                for q in p.lah.keys():
                    if len(q) < PObj.lrK:
                        q += PObj.sqEnd 
                    for i in range(min(len(iA), len(q))): # reduce amount of output if lrK>1, by showing only prefix of expected up to first difference to lah
                        if iA[i] != q[i]:
                            break 
                    ex.add(q[0: i+1])
            ex = ', '.join(strNSsq(q) for q in sorted(ex))
            r = f"syntax {'at begin' if tNr < 0 else 'after ' + inI[-1]} tokenNr {tNr} expected: {ex}, not lah: {strNSsq(iA[-5:])}"
            dbg(1, r)
            dbg(5, 'last tokens', iB[-5:], 'tokennr', tNr, ', lah', iA[0:5]
                , '\n\tpreceeding', iB[0: tNr+1]
                , '\n\tfollowing ', iA
                , '\n\tstack', len(st), 'res', res
                , '\n\t\t' + '\n\t\t'.join(strS(str(sx) + ': ' + Parser.eStr(st[sx][0]), 'state =', st[sx][1]) for sx in range(len(st)-1, -1, -1))
                )
            return r

    @staticmethod
    def makeGr(*grammar):
        "make Rule, Sym and Pos and State for given grammer"
        out("grammar source", grammar)
        Rule.clear()
        Rule.add(*grammar)
        Rule.finish()
        State.makeAll()

    @staticmethod
    def testGr(nm, syn, *cs):
        """test named nm, with grammar syn and testcases cs"""
        print(f"{nm} --- begin test " + strSep1)
        Parser.makeGr(*syn)
        if State.err != '':
            print('error generating grammar', nm, State.err)
        else:
            Parser.test(nm, *cs)

    @staticmethod
    def test(nm, *tsts):
        """ test for the current grammar, named nm the tests tsts """
    #    if tsts == None:
    #        tsts = ['2', '1  +  2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (']
        ll = 5
        def treeSplit(nd, b):
            """split the tree nd into linkes, 
                each line contains the path to one (shifted) token or to one empty rule
                intendation shows the children of a rule
                the left ll characters, contain the shifted token or spaces for an empty rule
                b is the current line, filled with the path of entered rules with the correct intendation
            """
            if isinstance(nd, str):
                return nd[0:ll-1].ljust(ll) + b[ll:] + '==' + nd
            c = b + str(nd[0]) +  ' ' 
            if len(nd) <= 1:
                return c + '==emptyRule'
            r = treeSplit(nd[1], c)
            c = len(c) * ' '
            for ch in nd[2:]:
                r += "\n" + treeSplit(ch, c)
            return r

        for i in range(len(tsts)) :
            sT = re.findall(r'\w+|\S', tsts[i]) 
            print(f"test begin  {nm} {i} input: {strNSsq(sT)} {strSep1}")
            assert all(len(t) == 1 if PObj.sqStr else len(t) > 0 for t in sT) 
            ast = Parser.parse(sT)
            if isinstance(ast, str):
                print(f'syntax test {nm} {i} : {ast}\n\tinput {strNSsq(sT)}')
            else:
                print(f"parsed {str(ast[0])} from {len(sT)} tokens, {Parser.pCnt} steps,  {len(State.p2s)} states, Rules {len(Rule.rules)}")
                print(treeSplit(ast, ll * ' '))
            dbg(5, 'test', 'syntaxed' if isinstance(ast, str) else 'parsed', f"{nm} {i} input {strNSsq(sT)} {strSep1}")

    @staticmethod
    def test1():
        """my simple tests """
        g = ["s=s+p", "s = p", "p = p '*' i", "p = i", "i = '(' s           ')'", "i = '0'", "i = '1'", "i = '2'"]
        Parser.testGr('arithExpr', g
            ,'2', '1  +  2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +')
        
        g2 = g[1:-3] + ["s = s pm p", "p = p '/' i", "i = epm d", "i = epm i d", "epm = pm", "epm ="
                , "pm = '+'", "pm = '-'", "d = '0'", "d = '1'", "d = '2'"]
        g = g[1:-3] + ["s = s pm p", "p = p '/' i", "i = epm d", "i = i d", "epm = pm epm", "epm ="
                , "pm = '+'", "pm = '-'", "d = '0'", "d = '1'", "d = '2'"]

        Parser.testGr('arithExprPM' , ["s=s pm p", "s=p", "p=p*i", "p=p/i", "p=i", "i=(  s )", "i = j"
            , "j=epm d", "j=j d", "epm = pm epm", "epm =  ", "pm=+", "pm=-", "d=0", "d=1", "d =  2  "]
            ,'2', '1  +  2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +'
            , '1 0 / - - 1 1 + + - 1 2 / ( 1 - - 2 / + 0 )', '- - 2 * * ++1 ** -2 * 1 + 0')

        Parser.testGr('arithExprPE' , ["s=s pm p", "s=p", "p=p*e", "p=p/e", "p=e", "e=i", "e=i**e", "i=(  s )", " i=j"
            , "j=epm d", "j=j d", "epm = pm epm", "epm =  ", "pm=+", "pm=-", "d=0", "d=1", "d =  2  "]
            ,'2', '1  +  2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +'
            , '1 0 / - - 1 1 + + - 1 2 / ( 1 - - 2 / + 0 )', '- - 2 * * ++1 ** -2 * 1 + 0')

        Parser.testGr('arithExprPE2' , ["s=s pm p", "s=p", "p=p*e", "p=p/e", "p=e" 
            , "e=i", "e=i**e", "e=pm e", "pm=+", "pm=-", "i=(  s )", "i = dd", "dd=d", "dd=d dd", "d=0", "d=1", "d =  2  "]
            ,'2', '1  +  2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +'
            , '1 0 / - - 1 1 + + - 1 2 / ( 1 - - 2 / + 0 )', '- - 2 * * ++1 ** -2 * 1 + 0')

        Parser.testGr('LR(2)?' , ["S = AA 'a' 'a'", "S = AB 'a' 'b'", "AA =", "AB ="]
            ,"a a", "a b", "a c", "b a")

    @staticmethod
    def testKnuth():
        """some of the examples given by Donald E. Knuth. 1965. On the Translation of Languages from Left to Right """
        Parser.testGr('knuth(1)'
            , ["S = A D", "A = 'a' C", "B = 'b' 'c' 'd'", "C = B E", "D =", "E = 'e'"]
            , "a b c d e", "a b x d e", "a b", "a b c d e f")
        Parser.testGr('knuth(6)'
            , ["S = 'a' A 'c'", "A = 'b' A 'b'", "A = 'b'"]
            , "a b b b  c", "a b c", "a b b c", "a b b b b b b b c e")
        Parser.testGr('knuth(7) lr(0)'
            , ["S = 'a' A 'c'", "A =  A 'b' 'b'", "A = 'b'"]
            , "a b b b  c", "a b c", "a b b c", "a b b b b b b b c e")
        Parser.testGr('knuth(16x) lr(3)'
            , ["S=B C", "B=C e", "B = ", "C=D", "C=D c", "D=", "D=d"]
            , "", "c", "c e", "c e d")
        Parser.testGr('knuth(24)'
            , ["S =",  "S = 'a' A 'b' S", "S =  'b' B 'a' S", "A =", "A = 'a' A 'b' A", "B =", "B = 'b' B 'a' B"]
            , "", "a b", "b a", "a b b", "a a b b", "a b a b", "b a a b")

#---------- Test: div internal tests ----------
class Test:
    def re():
        x = ' ab cd  e?f   *+??'
        print(f"x={x}")
        print("re: lookahead assertion and lookbehind assertions to find boundaries between words and or special character")
        for y in re.finditer(r'\s+|(?<=\S)(?=\S)((?<=\W)|(?=\W))', x):
            print([y.start(),y.end(), x[y.start(): y.end()+1]])
        print("easier and more stable against leading/trailing spaces: use findall for words or special chars")
        for y in re.finditer(r'\w+|\S', x):
            print([y.start(),y.end(), x[y.start(): y.end()]])
        print(re.findall(r'\w+|\S', x))
        x += "' '' b 'oder1''1.2und. 3 ' '.a.234 4.. single' !"
        print('mit strings and numbers:', x)
        print(re.findall(r"""'(?:''|[^'])*'    # single quoted strings, doubble apostrophs for a single
                                            # non capturing group (?:...) otherwise findall will return groups!
                            |\d+(?:\.\d*)    # number with leading digits and possible decimal point/digits
                            |\.\d+            # number with leading decimal points and digits
                            |\w+            # words
                            |\S"""            # special chars
                    , x
                    , re.X))                    # re.X = re.VERBOSE flag

    def dd():
        d = {}
        ddPut(d, 0, 'null')
        ddPut(d, 'a', 'b', 'ab')
        dbg(1, 'a, b', d)
        ddPut(d, 'a', 'c', 'ac')
        dbg(1, 'a, c', d)
        ddPut(d, 'a', 'c', 'd', 'acd')
        dbg(1, 'a, c, d', d)
        ddPut(d, 'a', 'c', 'ac++')
        dbg(1, 'a, c', d)
        dbg(1, 'ddGet a', ddGet(d, 'a'))
        dbg(1, 'ddGet y', ddGet(d, 'y'))
        dbg(1, 'ddGet a b', ddGet(d, 'a', 'b'))
        dbg(1, 'ddGet a y', ddGet(d, 'a', 'y'))
        dbg(1, 'ddGet a c d', ddGet(d, 'a', 'c', 'd'))
        dbg(1, 'ddGet a c d y', ddGet(d, 'a', 'c', 'd', 'y'))


    def LahIter():
        iter = LahIter(['begin', 0, 1,2,3,4])
        for i in iter:
            print(i, iter[0], iter[1], iter[-1])

    def range():
        l = [0, 1, 2]
        s = '012'
        print([(i, l[0:i], s[0:i]) for i in range(6)])
        print([(x, y) for x in range(3) for y in range(2)])
        print([(x, y) for x in range(3) for y in range(3*x, 4*x+1)])
        print([(x, y) for x in range(3) if x > 0 for y in range(3*x, 4*x+1) if y > x])

    @staticmethod
    def all():
        out('Test.all', 2 * strSep1)
        out('Test.re regular expressions', strSep1)
        Test.re()
        out('Test.dd dict dict tree', strSep1)
        Test.dd()
        out('Test.LahIter iterator with lookahead', strSep1)
        Test.LahIter()
        out('Test.range', strSep1)
        Test.range()
        out('Test.all end', 2 * strSep1)

if 0: 
    Test.all()
else:    
    Parser.test1()
    Parser.testKnuth()
print('end', __file__)