python/parser3.py

# knuth lr(k) parser

def err(*e):
    raise Exception('error:' + ', '.join(e))
    exit()

dbgLev = 0
dbgTime = 0
def dbg(l, *e):
    if dbgLev >= l:
        print(*e)
from datetime import * ;
import re;

class PObj:
    "root and container for all our objects"
    end = '@'
    rStart = 'start'
    lahStr = True # lah is true ==> a string, False => tuple of strings
    objs = []
    lrK = 0
    def __init__(self, nm = ''):
        self.name = f"{self.__class__.__name__ if nm == '' else nm}#{len(self.objs)}"
        self.objs.append(self)

    def __str__ (self):
        return self.name

    def __repr__ (self, at=''):
        return self.name

    def sExp (self, at=''):
        return self.name if at == '' else f'{self.name}{{{at}}}'

    @staticmethod
    def dbg(l, m):
        if dbgLev >= l:
            print(m)
            p = '['
            for o in PObj.objs:
                print(f"{p} {o.sExp()}")
                p = ','
            print(']')

class Rule (PObj):
    "a Rule has a name (a symbol or nonterminal) and a body of terminals and nonterminals"
    rules = None
    def __init__(self, sy, ty, bd):
        self.sym = sy
        super().__init__(f"R#{sy}")
        self.body = bd
        self.pos = ty
        self.lah = set()
        self.prd = set()

    def sExp (self, at=''):
        return super().sExp(f'sym={self.sym!s} body={self.body} pos0={self.pos[0] if self.pos != None else "-"}')

    @staticmethod
    def clear(): 
        rules = None
        State.clear()    
        PObj.objs.clear()
        RuleSym.n2y.clear()
        RuleSym.syms = None
        RulePos.poss = None
        Rule(PObj.rStart, 'n', ["?"])

    @staticmethod
    def addSA(*src):
        "add new rules: each rule is a string, elements separated by one or more spaces, terminals enclosed in apostrophs"
        for s in src:
            spl = re.split(r'\s+', s)
            assert(spl[1] == '=')
            Rule(spl[0], 'a', spl[2:])

    @staticmethod
    def addWN(*src):
        "add new rules: each rule is a string, elements are Words or special chars, terminals those not a rule name"
        for s in src:
            spl = re.findall(r'\w+|\S', s)
            assert(spl[1] == '=')
            Rule(spl[0], 'n', spl[2:])


    @staticmethod
    def makeSymPos():
        "make symbols and positions, after we got the source of all rules, convert body to symbols and bare strings"
        PObj.objs[0].body[0] = PObj.objs[1].sym
        Rule.rules = Rule.objs.copy()
        PObj.dbg(4, 'made rules')
        RuleSym.make()
        PObj.dbg(4, 'made sym')
        RulePos.make()
        PObj.dbg(3, 'made pos')

    @staticmethod
    def prdLah(k):
        """ compute for all rules 
             prd = productions of length <= k and
            lah = prefixes of lenght k of all productions with length > k
             in the outer loop increases lrK 1 by 1 """

        tS = datetime.now()
        assert not PObj.lahStr or k <= 1 or (len(PObj.end) == 1 
                and all([len(y) == 1 for r in Rule.rules for y in r.body if not isinstance(y, RuleSym)]))
        cnt = 0
        upd = True
        while upd: # recomute all rules, until no change
            upd = False
            lC = 0
            pC = 0
            for r in Rule.rules:
                cnt += 1
                if r.prdLahComb(k):
                    upd = True
                lC += len(r.lah)
                pC += len(r.prd)
        elT = f"{(datetime.now() - tS).total_seconds()}sec, " if dbgTime else ''
        print(f"Rule.prdLah k={k}<-{PObj.lrK}: {cnt} {elT} #prd={pC} #lah={lC}")
        if dbgLev >= 3:
            for r in Rule.rules:
                dbg(3, f"{r} {r.body} ---prd {r.prd} ---lah {r.lah}")
        assert all([len(s) == k for s in r.lah for r in Rule.rules])
        assert all([len(s) <= k for s in r.prd for r in Rule.rules])
        PObj.lrK = k

    def prdLahComb(self, k):
        "compute lah and prd combining rules and already computed prd and lah"

        ee = set()
        oL = self.lah.copy()
        lP = { '' if PObj.lahStr else ()}
        for y in self.body:
            if isinstance(y, str):
                aP = { y if PObj.lahStr else (y,) }
                aL = ee
            else:
                aP = ee.union(*[r.prd for r in y.rules])
                aL = ee.union(*[r.lah for r in y.rules])
            lQ = set()
            for p in lP:
                for a in aP:
                    n = p + a
                    if len(n) <= k:
                        lQ.add(n)
                    else:
                        self.lah.add(n[0:k])
                for a in aL:
                    if len(p) + len(a) >= k:
                        self.lah.add((p + a)[0:k])
            lP = lQ
        assert lP >= self.prd
        assert self.lah >= oL
        lL = self.lah - lP
        s = {x for x in lL if len(x) < k} 
        if len(s) > 0:
            lL -= s
            rm = self.lah | lP
            while True:
                rm = {x[0:-1] for x in rm if len(x) > 0}
                s -= rm
                if len(s) == 0 or len(rm) == 0:
                    break 
            lL |= s
        upd = lP != self.prd or oL != lL
        self.prd = lP
        self.lah = lL
        dbg(4, f"self {self} upd{upd} prd {self.prd} lah {self.lah}")
        return upd

class RuleSym (PObj):
    "the symbols are our nonterminals, which are a set of rules"
    n2y = {}
    syms = None
    def __init__(self, sym):
        self.sym = sym
        super().__init__(f"Y#{sym}")
        self.rules = []
        self.lah = set()
    def sExp (self):
        return super().sExp(f"sym={self.sym!s} {self.rules} lah={self.lah}")

    @staticmethod
    def make():
        "make the symbols from defined rules and convert Rule.body to symbols and bare strings"
        i0 = len(PObj.objs)
        for r in Rule.rules: #collect symbols
            if r.sym not in RuleSym.n2y:
                RuleSym.n2y[r.sym] = RuleSym(r.sym)
            RuleSym.n2y[r.sym].rules.append(r)
            r.sym = RuleSym.n2y[r.sym]
        RuleSym.syms = PObj.objs[i0:]
        assert Rule.objs[0].sym == RuleSym.syms[0]
        assert RuleSym.syms[0].sym == PObj.rStart
        assert RuleSym.n2y[PObj.rStart] == RuleSym.syms[0]
        assert len(RuleSym.n2y[PObj.rStart].rules) == 1
        for r in Rule.rules: # convert Rule.body to symbols and bare strings
            b = r.body
            if r.pos == 'a':
                for ix in range(len(b)):
                    v = b[ix]
                    if v[0] == "'" and v[-1] == "'":
                        b[ix] = v[1: len(v)-1]
                        assert b[ix] != PObj.end
                    elif v in RuleSym.n2y:
                        b[ix] = RuleSym.n2y[v]
                    else:
                        err(f"rule {v} in {r} not in {Rule.rules}")
            elif r.pos == 'n':
                for ix in range(len(b)):
                    v = b[ix]
                    if v in RuleSym.n2y:
                        b[ix] = RuleSym.n2y[v]
            else:
                err(f'bad pos {r.pos} in {r.sExp()}')

class RulePos (PObj):
    poss = None
    def __init__(self, r, p):
        super().__init__('P')
        self.rule = r
        self.pos = p
        self.next = None
        self.lah = None
    def sExp (self):
        return super().sExp(f"{self.rule!s} {self.pos} next={self.next!s}")

    @staticmethod
    def make():
        l = len(PObj.objs)
        for r in Rule.rules:
            b = r.body
            r.pos = [RulePos(r, ix) for ix in range(len(b)+1)]
            for ix in range(len(b)):
                r.pos[ix].next = r.pos[ix+1]
        RulePos.poss = PObj.objs[l:]

    def makeLah(k):
        # compute lah for RuleSym and RulePos using lah from Rule
        #     lah = production of len <= k and prefix of len k of all production longer k
        Rule.prdLah(k)
        for y in RuleSym.syms:
            y.lah = set().union(*[r.lah | r.prd for r in y.rules])
            dbg(3, f"{y} {y.lah}")
        for r in Rule.rules:
            r.pos[-1].lah = {'' if PObj.lahStr else ()}
            for pX in range(len(r.body)-1, -1, -1): # backward through the positions
                p = r.pos[pX]
                e = r.body[pX]
                lS = e.lah if isinstance(e, RuleSym) else {e if PObj.lahStr else (e,)}
                n = set()
                for l in lS:
                    if len(l) == k:
                        n.add(l)
                    else:
                        for o in p.next.lah:
                            dbg(9, f"n.add(({l}+{o})[0:k])")
                            n.add((l+o)[0:k])
                p.lah = n
                dbg(3, f"{r} {pX} {p} {p.lah}")
            assert r.pos[0].lah == r.prd | r.lah, (f"rule {r.sExp()} fails \npos[0].lah={r.pos[0].lah}" 
                    f" \n== prd={r.prd} \n| lah={r.lah}"
                    f",\ndifference lah-..={r.pos[0].lah-r.prd-r.lah}, ..-lah={(r.prd|r.lah)-r.pos[0].lah}")

class State (PObj):
    pa2u = {}
    st0 = None
    states = None    
    lrErr = None
    def __init__(self, pa):
        super().__init__('S')
        assert isinstance(pa, frozenset)
        assert pa not in State.pa2u
        self.pa = pa
        self.to = {}
        dbg(3, f"State {self.sExp()}")
        State.pa2u[pa] = self    
        for a in pa:
            self.addPosA(a)
        
    def sExp (self):
        return super().sExp(f"{self.pa} to={self.to}") 

    def addPosA(self, pa):
        "add (position, the string after the the end of the current rule) to self, recursively add subrules"
        p, a = pa
        if State.lrErr != None:
            return
        if p.next == None:    #after all positions of rule ==> reduce
            if a not in self.to:
                self.to[a] = p.rule
            elif self.to[a] != p.rule:
                State.lrErr = f"lrK mismatch [{a}]={self.to[a]} != p={p.sExp()}"
        else:
            e = p.rule.body[p.pos]
            k = PObj.lrK
            t = (p.next, a)
            if isinstance(e, str): #shift character on stack with next state
                for l in p.lah:
                    assert l[0] == e
                    if len(l) != k:
                        l = (l+a)[0:k] 
                    if l not in self.to:
                        self.to[l] = {t}
                    elif not isinstance(self.to[l], set):
                        State.lrErr = f"lrK mismatch [{l}]->{self.to[l]} cannot add {t}"
                        return
                    else:
                        self.to[l].add(t)
            else: 
                assert isinstance(e, RuleSym)    
                if e not in self.to: # next state after this symbol
                    self.to[e] = {t}
                elif t in self.to[e]:
                    return
                else:
                    self.to[e].add(t)
                for n in p.next.lah: # enter pos[0] of subrules, string after is lah of next position of rule
                    if len(n) != k:
                        n = (n+a)[0:k] 
                    for r in e.rules:
                        self.addPosA((r.pos[0], n))
                        if State.lrErr != None:
                            return

    def addTrans(self):
        "modify to: find/create states for (pos, aft) sets in to.values"
        for k, v in self.to.items():
            if isinstance(v, Rule):
                continue
            pa = frozenset(v)
            if pa in self.pa2u:
                self.to[k] = self.pa2u[pa]
            elif State.lrErr != None:
                return
            else:
                n = type(self)(pa)
                self.to[k] = n
                n.addTrans()

    @staticmethod
    def clear():
        if State.st0 != None:
            del PObj.objs[State.st0:]
            st0 = None
        State.states = None
        State.pa2u.clear()

    @staticmethod
    def make(k):
        "make state for LR(k) grammar, return errormessage if not LR(k)"
        assert k >= 1
        State.clear()
        RulePos.makeLah(k)
        State.st0 = len(PObj.objs)
        State.lrErr = None
        b = State(frozenset([(Rule.rules[0].pos[0], k * (PObj.end if PObj.lahStr else (PObj.end, )))]))
        e = State(frozenset(b.to[RuleSym.syms[1]]))
        b.addTrans()
        e.addTrans()
        State.states = PObj.objs[State.st0: ]
        Parser.states = State.states
        if State.lrErr != None:
            return f"grammar not LR({k}), because of {State.lrErr}"

    @staticmethod
    def checkLR0():
        """ check if the current LR(1) grammar is even LR(0)
            
            return None if it is, otherwsie a string describing the conflicting state

            LR(0) grammars are implemented using LR(1) grammars

            LR(0) means, that a reduce step does not depend on lookahead.
                a shift step however uses the char shifted (== lookahead) to determine the next state """
        assert PObj.lrK == 1
        for u in State.states:
            assert len(u.to) > 0
            fi = None
            for k,v in u.to.items():
                if not isinstance(k, RuleSym):
                    if fi == None:
                        fi = v
                    elif v != fi if isinstance(v, Rule) else not (isinstance(v, State) and isinstance(fi, State)):
                        return f"grammar not LR(0) because of {u.sExp()}"

class Parser:
    states = None
    pCnt = 0

    @staticmethod
    def parse(inp, states = None):
        if states == None:
            states = Parser.states
        st = [[states[0], 'st 0']]
        stFi = states[1]

        lahLen = max(1, PObj.lrK)
        lahStr = PObj.lahStr

        if lahStr:
            lahEnd = lahLen * PObj.end 
            inI = iter(inp + lahLen * [PObj.end])
            dbg(3, f"iter coll {inp + lahLen * [PObj.end]}")
            lah = ''.join([next(inI) for i in range(lahLen)])
        else:
            lahEnd = PObj.lrK * (PObj.end, )
            inI = iter(inp + list(lahEnd))
            lah = tuple([next(inI) for i in range(lahLen)])

        dbg(3, f"parse lrK={PObj.lrK} len={lahLen} lah={lah}")
        Parser.pCnt = -1
        cntL = (len(inp) + lahLen) * len(Rule.rules)
        ls = ''
        while True:
            Parser.pCnt  += 1
            if Parser.pCnt  > cntL:
                err("too many loops cnt={Parser.pCnt}, lah={lah} st={st}")
            tos = st[-1]
            dbg(2, f'run{Parser.pCnt} {ls} --> lah={lah} tos={tos}, st={len(st)} {st}')
            m = tos[0]
            if lah not in m.to:
                return(f"bad lah={lah} expected {list(m.to.keys())} at {m.sExp()}")
            go = m.to[lah]
            if isinstance(go, State): #shift
                ls = "shift"
                if not lahStr:
                    st.append([go, lah[0]])
                    lah = lah[1:] + (next(inI), )
                elif lahLen == 1:
                    st.append([go, lah])
                    lah = next(inI)
                else:
                    st.append([go, lah[0]])
                    lah = lah[1:] + next(inI)
            elif isinstance(go, Rule):    
                ls = f"red {go}"
                n = [go]
                if len(go.body) > 0:
                    nx = len(st) - len(go.body)
                    for i in range(nx, len(st)):
                        n.append(st[i][1])
                    del st[nx+1:] 
                else:
                    nx = len(st)
                    n.append('')
                    st.append(None)
                pa = st[nx-1][0]
                if go.sym not in pa.to:
                    if lah == lahEnd and len(st) == 2 and st[1][0] == stFi:
                        return st[1][1]
                    return f'lah={lah} sym {go.sym} missing in pa {pa}, stack {st}'
                dbg(2, f'run{Parser.pCnt}go {ls} --> pa={pa} go.sym={go.sym}')
                st[nx] = [pa.to[go.sym], n]            
            else:
                err(f"bad go {go}, lah={lah}, st={st}")

    @staticmethod
    def makeGr(f, *grammar):
        "make Rule, Sym and Pos for given grammer in form for function f"
        dbg(1, f"grammar in form {f.__name__}, source {grammar}")
        PObj.lrK = 0
        Rule.clear()
        f(*grammar)
        Rule.makeSymPos()

    @staticmethod
    def makeState():
        "try to make lr(k) states for k=0,1, ..."
        tS = datetime.now()
        k = 0
        st = f"{len(RuleSym.syms)} syms, {len(Rule.rules)} rules, {len(RulePos.poss)} rulePos"
        while True:
            k += 1
            res = State.make(k)
            el = (datetime.now() - tS).total_seconds()
            elT = f"after {el}sec: " if dbgTime else ''
            if res == None:
                break
            print(f"{res}: {elT}{st} {len(State.states)} states")
            if el > 7:
                print("stop by timeout, do not check next LR(k)")
                return False
            if k > 20:
                print("stop by k >Y 20, do not check next LR(k)")
                return 

        if k == 1:
            res = State.checkLR0()
            if res == None:
                k=0
            else:
                print(f"{res}: {elT}")
        print(f'grammar is LR({k}): {elT}{st} {len(State.states)} states')
        Parser.states = State.states
        PObj.dbg(1, 'State 0')
        return True

    @staticmethod
    def testGr(nm, f, syn, *cs):
        """test named nm, with grammar syn (in form for function f) and testcases cs"""
        print(f"{nm} --- begin test -----------")
        Parser.makeGr(f, *syn)
        if Parser.makeState():
            Parser.test(nm, *cs)

    @staticmethod
    def test(nm, *tsts):
        "test for the current grammar, named nm the tests tsts"
        if tsts == None:
            tsts = ['2', '1  +  2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (']
        trS = 0
        def treeSplit(nd, pr):
            nonlocal trS
            trS += 1
            if isinstance(nd, str):
                return pr + nd
            r = pr + str(nd[0]) + '/' + str(len(nd[0].body))
            for ch in nd[1:]:
                r += treeSplit(ch, pr + '  ')
            return r

        def treeSpli2(nd, of):
            nonlocal trS
            trS += 1
            if isinstance(nd, str):
                return nd
            r = str(nd[0]) + '/' + str(len(nd[0].body)) + ' '
            of += len(r)
            r += treeSpli2(nd[1], of)
            for ch in nd[2:]:
                r += "\n" + of * ' ' + treeSpli2(ch, of)
            return r

        for i in range(len(tsts)) :
            print(f"{nm} test {i} input {tsts[i]} ------>>>")
            sT = [x for x in re.split(r'\s+', tsts[i]) if x != ''] # void empty strings, e.g. for ''
            ast = Parser.parse(sT)
            if isinstance(ast, str):
                print(f'error run: {ast}')
            else:
                #print('run', tsts[i] , ' ==>', treeSplit(ast, "\n  "))
                print(' ', treeSpli2(ast, 2))
                print(f"  stats pCnt {Parser.pCnt}, inp {len(sT)}, lrK {PObj.lrK}, Rules {len(Rule.rules)}")

    @staticmethod
    def testSAT(nm, syn, *cs):
        """test named nm, with grammar syn (in form SA) and testcases cs"""
        Parser.testGr(nm, Rule.addSA, syn, *cs)

    @staticmethod
    def testWNT(nm, syn, *cs):
        """test named nm, with grammar syn (in form WN) and testcases cs"""
        Parser.testGr(nm, Rule.addWN, syn, *cs)

    @staticmethod
    def test1():
        """my tests """
        g = ["s = s '+' p", "s = p", "p = p '*' i", "p = i", "i = '(' s           ')'", "i = '0'", "i = '1'", "i = '2'"]
        Parser.testSAT('arithExpr', g
            ,'2', '1  +  2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +')
        
        g2 = g[1:-3] + ["s = s pm p", "p = p '/' i", "i = epm d", "i = epm i d", "epm = pm", "epm ="
                , "pm = '+'", "pm = '-'", "d = '0'", "d = '1'", "d = '2'"]
        g = g[1:-3] + ["s = s pm p", "p = p '/' i", "i = epm d", "i = i d", "epm = pm epm", "epm ="
                , "pm = '+'", "pm = '-'", "d = '0'", "d = '1'", "d = '2'"]
        Parser.testWNT('arithExprPM' , ["s=s pm p", "s=p", "p=p*i", "p=p/i", "p=i", "i=(  s ) "
            , "i=epm d", "i=i d", "epm = pm epm", "epm =  ", "pm=+", "pm=-", "d=0", "d=1", "d =  2  "]
            ,'2', '1  +  2 * 0', '2 * ( 1 * 2 + 0 )', '1 + 2 (', '1 + 2 +'
            , '1 0 / - - 1 1 + + - 1 2 / ( 1 - - 2 / + 0 )')

        Parser.testSAT('LR(2)?' , ["S = AA 'a' 'a'", "S = AB 'a' 'b'", "AA =", "AB ="]
            ,"a a", "a b", "a c", "b a")

    @staticmethod
    def testKnuth():
        """some of the examples given by Donald E. Knuth. 1965. On the Translation of Languages from Left to Right """
        Parser.testSAT('knuth(1)'
            , ["S = A D", "A = 'a' C", "B = 'b' 'c' 'd'", "C = B E", "D =", "E = 'e'"]
            , "a b c d e", "a b x d e", "a b", "a b c d e f")
        Parser.testSAT('knuth(6)'
            , ["S = 'a' A 'c'", "A = 'b' A 'b'", "A = 'b'"]
            , "a b b b  c", "a b c", "a b b c", "a b b b b b b b c e")
        Parser.testSAT('knuth(7) lr(0)'
            , ["S = 'a' A 'c'", "A =  A 'b' 'b'", "A = 'b'"]
            , "a b b b  c", "a b c", "a b b c", "a b b b b b b b c e")
        Parser.testWNT('knuth(16x) lr(3)'
            , ["S=B C", "B=C e", "B = ", "C=D", "C=D c", "D=", "D=d"]
            , "", "c", "c e", "c e d")
        Parser.testSAT('knuth(24)'
            , ["S =",  "S = 'a' A 'b' S", "S =  'b' B 'a' S", "A =", "A = 'a' A 'b' A", "B =", "B = 'b' B 'a' B"]
            , "", "a b", "b a", "a b b", "a a b b", "a b a b", "b a a b")


def testRe():
    x = ' ab cd  e?f   *+??'
    print(f"x={x}")
    print("re: lookahead assertion and lookbehind assertions to find boundaries between words and or special character")
    for y in re.finditer(r'\s+|(?<=\S)(?=\S)((?<=\W)|(?=\W))', x):
        print([y.start(),y.end(), x[y.start(): y.end()+1]])
    print("easier and more stable against leading/trailing spaces: use findall for words or special chars")
    for y in re.finditer(r'\w+|\S', x):
        print([y.start(),y.end(), x[y.start(): y.end()]])
    print(re.findall(r'\w+|\S', x))
    x += "' '' b 'oder1''1.2und. 3 ' '.a.234 4.. single' !"
    print('mit strings and numbers:', x)
    print(re.findall(r"""'(?:''|[^'])*'    # single quoted strings, doubble apostrophs for a single
                                        # non capturing group (?:...) otherwise findall will return groups!
                        |\d+(?:\.\d*)    # number with leading digits and possible decimal point/digits
                        |\.\d+            # number with leading decimal points and digits
                        |\w+            # words
                        |\S"""            # special chars
                , x
                , re.X))                    # re.X = re.VERBOSE flag

if 0:
   testRe()
   exit()
if 0:
    l = [0, 1, 2]
    s = '012'
    print([(i, l[0:i], s[0:i]) for i in range(6)])
    print([(x, y) for x in range(3) for y in range(2)])
    print([(x, y) for x in range(3) for y in range(3*x, 4*x+1)])
    print([(x, y) for x in range(3) if x > 0 for y in range(3*x, 4*x+1) if y > x])
    exit()

Parser.test1()
Parser.testKnuth()
print('end', __file__)