php/suffixTree.php

<form action="<?php echo basename(__file__); ?>" method="post">
<h2><?php echo basename(__file__); ?> compute suffix tree für String</h3>
<?php 
    $src = isset($_POST['string']) ? $_POST['string'] : 'kakao';
    traceSet(isset($_POST['trace']) ? $_POST['trace'] : 1);
    $withTrie = isset($_POST['trie']);
?>

<p>string: <input type="text" name="string" size="100" value=<?php echo '"' . $src . '"'; ?> /></p>
<p><input type="checkbox" name="trie" <?php if ($withTrie) echo "checked"; ?> />also suffixTrie &nbsp;&nbsp;&nbsp; 
<input type="input" name="trace" maxlength=2 size=2 value="<?php echo $trace; ?>" />trace &nbsp;&nbsp;&nbsp; 
<p><input type="submit" value="compute tree" /></p>
<?php if ($withTrie) {
    echo "<h2> suffix trie für $src </h2>";
    $t = suffixTrie($src);
    echo "srcLen=" . strLen($src) . " treeCount=" . count($t); 
    showTable($t); 
    }
?>
<h2> suffix tree für <?php echo "len=" . strlen($src) . ": $src"; ?> </h2>
<?php 
suffixTree($src);
say($suffixTreeInfo . suffixTree2Table(), 4); 
?>
<h2>Source</h2>
<?php echo highlightNum(__file__); ?>
</form>
<?php

function suffixTrie($src) {
    $fx = 0;
    trace("start of suffixTrie $src", 1, 1);
    $t = [];
    suffixTrieAdd($t, 0, '', 0, 0);
    $t[0]['substr'] = $src;
    for ($cy=0; $cy < strlen($src); $cy++) {
        $ch = substr($src, $cy, 1);
        $fx = $nx = suffixTrieAdd($t, $fx, 0, $cy);     #node for longest substring
        trace("for1 cy=$cy nx=$nx");
        for ($cx=1; $t[$nx]['par']; $cx++) {             # handle all suffixes
            trace("for2 cy=$cy nx=$nx cx=$cx");
            $st = $t[$t[$nx]['par']]['suf'];              # suffix of parent
            if ($sx = suffixTrieChild($t, $st, $ch)) { 
                $t[$nx]['suf'] = $sx;                     # found follower character - remaining boundary path ok
                break;
                }
            $nx = $t[$nx]['suf'] = suffixTrieAdd($t, $st, $cx, $cy); # add current character
            }
        }
    trace("end of suffixTrie $src", 1, 2);
    return $t;
}

function suffixTrieAdd(&$t, $px, $cx, $cy) {
    $nx = count($t);
    trace("add nx=$nx cx=$cx $cy");
    $t[$nx] = ['par' => $px, 'cFi' => $cx, 'cLa' => $cy, 'suf' => 0, 'chi' => 0];
    $n = &$t[$nx];
    if ($nx) {
        $src = $t[0]['substr'];
        $n['sis'] = $t[$px]['chi'];
        $t[$px]['chi'] = $nx;
        $n['substr'] = str_repeat('_', $cx) . substr($src, $cx, $cy + 1 - $cx);
        $n['delta'] = substr($src, $cy, 1);
        }
    else {
        $n['sis'] = 0;
        $n['substr'] = $n['delta'] = '';
        }
    return $nx;
}

function suffixTrieChild(&$t, $px, $ch) {
    for ($cx = $t[$px]['chi']; $cx; $cx = $t[$cx]['sis']) {
        if ($t[$cx]['delta'] == $ch ) 
            return $cx;
        }
    return 0;
}

function showTable($t) {
    $hx = 0;
    $hd = [];
    foreach($t as $rk => $r)  {
        foreach($r as $k => $v)  {
            if( ! isset($hd[$k])) 
                $hd[$k] = ($hx++);
            }
        }
    echo "<table border=2><tr><th>id </th>";
    foreach ($hd as $h => $hn)
        echo "<th> $h </th>";
    foreach($t as $rk => $r)  {
        echo "</tr><tr><td> $rk </td>";
        foreach($hd as $h => $hn)  {
            echo '<td> ' . (isset($r[$h]) ? is_array($r[$h]) ? '#'.count($r[$h]) : $r[$h] : '---') . ' </td>';
        }
    }
    echo "</tr></table>";
}

function suffixTree($aSrc) { # create suffixTree for string $aSrc
    global $t, $src, $srE, $srL, $cUpdateFor, $cAncCx2Par, $trace, $suffixTreeInfo, $assertNo;
    $srL = strlen($src = $aSrc);
    $srE = $cUpdateFor = $cBoundary2Suf = $cAncCx2Par = 0;
    $chk = -99;
    trace("suffixTree len=" . strlen($src) . " src=$src", 1, 1);
    $t = [];
    $root = ndNew('-', 0, 0, 0);
    $lx = ndNew($root, 0, 0, $srL);        # last leaf on boundary path
    $loop=1e30;
    for ($srE=1; $srE <= $srL; $srE++) {
        if ($loop-- < 0) die('loop');        
        $ch = substr($src, $srE-1, 1);
        trace("expandTo srcEnd=srE=$srE ch=$ch lx" . nd2s($lx));
        $assertNo or $t[$lx]['cEn'] == $srL or assertSay("active point not to end lx" . nd2s($lx));
            # skip over final leaves pointing to final leaves
        for ($rx = $t[$lx]['cRo']; $lx and $rx < $srL and ($sx=$t[$lx]['suf']) and $t[$sx]['cEn'] == $srL and $rx + 1  == $t[$sx]['cRo']; $rx++) {
            trace("finalLeave from" .nd2s($lx) . " to" . nd2s($sx), 2);
            $lx = $sx;
            $cBoundary2Suf ++ ;
            }
        trace("expandToLastFinal srE=$srE lx=$lx ch=$ch");
        suffixTreeUpdate($lx, ndLen($lx, $srE));
        trace("expandToEnd srE=$srE lx=$lx ch=$ch");
        $assertNo or suffixTreeCheck($chk = $srE);        
        }
    if ($chk !== $srL)
        suffixTreeCheck($srL);    
    trace($suffixTreeInfo = "suffixTreeEnd len=" . strlen($src) . " treeCount=" . count($t) . " cBoundary2Suf=$cBoundary2Suf cUpdateFor=$cUpdateFor cAncCx2Par=$cAncCx2Par src=$src", 1, 2);
    return $t;
} # suffixTree end

function suffixTreeUpdate($bx, $le) { # extend unfinished suffixTree from length $le-1 to $le. $bx is the node for the complete source string
    global $t, $src, $srL, $srE, $loop, $cUpdateFor, $assertNo;

    $cE = $t[$bx]['cRo'] + $le;
    for ($sL=$le ; $bx; $bx=$sx) {         # walk from active point along boundary path
        if ($loop-- < 0) die('loop');            
        $cUpdateFor++;
        $sL--; # new len of suffix
        $b = &$t[$bx];
        $sw = $b['suf'];
        trace("for3 sL=$sL, b" . nd2s($bx) . " s" . nd2s($sw), 2);
        $assertNo or ndSufLenCheck($bx, $sL);
            
        $di = (ndLen($sw) < $sL or substr($src, $b['cRo'] + $sL, 1) != substr($src, $t[$sw]['cRo'] + $sL-1, 1)); # is suffix different for new len?
         $sx = suffixTreeAncCx($sw, $di ? $sL - 2 : ndLen($bx) - 1); #ancestor containing lastEqual character
        $s = &$t[$sx];
        $assertNo or ndSufSxLenCheck($bx, $sx, $sL);
        trace("ancestor" . ($sw === $sx ? '===' : '???') . "sx=$sx sw=$sw dB=dB ",11); 

 
        $py = 0;
        $sy = $sx;
        if ($sL < 1) {
            break;
            }
        elseif (ndLen($sx) < $sL) {
            if (! $sy = ndChild($sx, substr($src, $b['cRo'] + 1 + $s["cEn"] - $s["cRo"], 1))) { 
                $sy = ndNew($sx, $srE - $sL, ndLen($sx) + $srE - $sL, $srL);
                suffixTreeSufSet($sy, $s['suf']);
                $assertNo or ndSufEndCheck($sy, $srE-1);
                }
            }
        elseif (! $di) {
            if ($b['cEn'] > $srE or ndLen($sx) == ndLen($bx) - 1) {
                if ($sy == $sw) 
                    break;
                }
            else {
                $assertNo or $sL <= ndLen($sy) and $sL > ndPrL($sy) or assertSay("sL=$sL longer sy" . nd2s($sy));
                if ($sL < ndLen($sy)) {
                    $sz = ndSplit($sy, $sL);
                    if ($b['cEn'] == $srL or ndLen($bx) - 1 <= ndLen($sz) )
                        $sy = $sz;
                    }
                }
            }
        else { 
            $px = $sx;
            if ($sL-1 != ndLen($px))
                $py = $px = ndSplit($px, $sL-1);
            $sy = ndNew($px, $b['cRo']+1, ndLen($px) + 1 + $b['cRo'], $srL); 
            suffixTreeSufSet($sy, $s['suf']);
            trace("added3 ky" . nd2s($sy) . " py" . nd2s($py));
            }
        suffixTreeSufSet($bx, $sy);
        if ($sw !== $b['suf']) {
            $sx = $b['suf'];
            trace("Update4 bx=$bx sx" . nd2s($sx) . " par" . nd2s($t[$sx]['par']), 2);
            }
        $assertNo or ndSufEndCheck($bx, $srE);
        if ($py) 
            suffixTreeUpdate($py, ndLen($py));
        }
} # suffixTreeUpdate end

function suffixTreeCheck($cE) { #check unfinishe suffix tree for length $cE
    global $t, $src, $srL, $trace;
    if ($trace >= 1)
        say("updates for $cE " . ($cE < 12 ? substr($src, 0, $cE) : '...' . substr($src, $cE-10, 10)) . '... ' . suffixTree2Table($cE >= 3 ? $cE : 0));
    $cc = count($t);
    trace("checkTree curEnd=cE=$cE count $cc: boundary (0, $srL) ... (" . ($cE-1) . ", $srL)", 1, 5);
    $sx = 1;
    $boC = 0;
    $boLi = '';
    for ($cx=0; $cx < $cE; $cx++ ) {
        ndLen($sx, $cE) >= $cE-$cx and substr($src, $cx, $cE-$cx) == substr($src, $t[$sx]["cRo"], $cE-$cx)
            or assertSay("boundary $sx <-> suffix $cx");
        if ($t[$sx]["cRo"] != $cx or $t[$sx]["cEn"] != $srL or count($t[$sx]["chi"]) != 0) {
            $boC++;
            if (FALSE === strPos("$boLi,", ",$sx,")) 
                $boLi .= ",$sx";
            }
        $sx = $t[$sx]['suf'];
        }
    trace("temporaryBoundaries" . ( $boC ? " $boC (" . substr($boLi, 1) .")" : 'None'), 1, 5); # should be empty after stopper character
    trace("parent suffix", 1, 5);
    for ($nx=1; $nx < count($t); $nx++) {
        ndCheck($nx) + ndSufEndCheck($nx, $cE);
        }
    trace("treeStructure", 1, 5);
    $chi = [($px = 0) => 1];
    while (TRUE) {
        $px < $cc and isset($t[$px]) and $chi[$px] == 1 or assertSay('missing parent' . nd2s($px));
        $chi[$px] = 2;
        foreach ($t[$px]['chi'] as $ch => $cx) {
            strlen($ch) == 1 and $px == $t[$cx]["par"] and $ch == substr($src, $t[$cx]["cFi"], 1) or assertSay("bad chi ch=$ch par" . nd2s($px) . " cx" .nd2s($cx));
            ! isset($chi[$cx]) or assertSay("duplicate child $cx in par $px");
            $chi[$cx] = 1;
            }
        foreach($chi as $px => $v) {
            if ($v == 1)
                continue 2;
            }    
        break;
        }
    for ($nx=1; $nx < count($t); $nx++) {
        isset($chi[$nx]) or assertSay("missing child $nx");
        }    
    trace("endCheck $cE", 1, 5);
} # suffixTreeCheck end


function suffixTreeAncCx($nx, $ox) { # find ancestor of $ox't character of suffix in delta
    global $t, $cAncCx2Par, $assertNo;
    for ($ax=$nx; $ax > 0 and $ox < ndPrL($ax); $ax=$t[$ax]['par']) {
        $cAncCx2Par ++ ;
        trace("ancCxDown ox=$ox from" . nd2s($nx) . " to" . nd2s($ax), 2);
        } 
    $assertNo or ($ox < 0 and $ax === 0) or ($ox >= ndPrL($ax) and ($ox < ndLen($ax) or $ax === $nx)) or assertSay("ox=$ox not in delta of ax" . nd2s($ax) . " nx" . nd2s($nx));
    return $ax;
}

function suffixTreeSufSet($nx, $sx) { # set suf from ancestor of $sx with offset $ox in delta
    global $t, $src, $srE, $assertNo;
    $sy = suffixTreeAncCx($sx, ndLen($nx) -2);
    if ($sy != $t[$nx]['suf']) {
        $t[$nx]['suf'] = $sy;
        trace("SufSet nx" .nd2s($nx) . " sy" . nd2s($sy), 12);
        $t[$nx]['upd'] = $srE;
        }
    $assertNo or ndSufLenCheck($nx, ndLen($nx, $srE - 1));
}

function ndNew($px, $cRo, $cFi, $cEn) { # create a new node: suffix = substr($src, $cRo, $cEn-$cRo), delta = substr($src, $cFi, $cFi-$cRo)
    global $t, $src, $srE, $assertNo;
    $nx = count($t);
    $t[$nx] = ['par' => '-', 'cRo' => $cRo, 'cFi' => $cFi, 'cEn' => $cEn, 'suf' => 0, 'chi' => [], 'cre' => $srE, 'upd' => $srE];
    if ($px !== '-') 
        ndChildAdd($px, $nx);
    $assertNo or ndCheck($nx);
    trace("added nx". nd2s($nx) . " px" . ($px === '-' ? $px : nd2s($px)), 12);
    return $nx;
}

function nd2s($nx) { # show node $nx as string
    global $t, $src;
    if (! isset($t[$nx]))
        return "====[id=$nx notIn t]";
    $n = &$t[$nx];
    return "===[id=>$nx, " .a2s($t[$nx]) . ", delta=>" . substr($src, $t[$nx]['cFi'], $t[$nx]['cEn'] - $t[$nx]['cFi']) . " suf=>" . ndSuf($nx) ."]";
}


function ndLen($nx, $cE=9e9) { # return length of suffix of node, if second parameter specified only to the currentEnd $cE of $src
    global $t, $src, $srL, $assertNo;
    $le = min($t[$nx]['cEn'], $cE) - $t[$nx]['cRo'];
    $assertNo or $le >= 0 and $le <= $srL or assertSay('bad len $le for ce=$cE in nx' . nd2s($nx));
    return $le;
}

function ndSuf($nx, $cE=9e9) { # return suffix of node $nx, if second parameter specified only to the currentEnd $cE of $src
    global $t, $src, $assertNo;
    $assertNo or $nx !== '-' or assertSay("bad nx=$nx");
    return substr($src, $t[$nx]['cRo'], ndLen($nx, $cE));
}

function ndPrL($nx) { # length of prefix of current path: from root to parent node
    global $t, $src, $assertNo;
    $le = $t[$nx]['cFi'] - $t[$nx]['cRo'];
    $assertNo or $nx >= 0 and $le >= 0 and ($nx === 0 or substr($src, $t[$nx]["cRo"], $le) === ndSuf($t[$nx]["par"])) 
        or assertSay("badPrefix le=$e nx" . nd2s($nx) . ' par' . nd2s($t[$nx]["par"]));   
    return $le;
}

function ndPre($nx) { # prefix of current path: from root to parent node
    global $t, $src;
    return substr($src, $t[$nx]['cRo'], ndPrL($nx));
}


function ndChildAdd($px, $cx) { # add to parent $px the child $cx
    global $t, $src, $srE, $assertNo;
    $ch = substr($src, $t[$cx]['cFi'], 1);
    $assertNo or $cx > 0 and $t[$cx]["par"] == "-" and ! isset($t[$px]["chi"][$ch]) or assertSay("ch=$ch bad orphan" .nd2s($cx) . " for parent" . nd2s($px));
    $assertNo or ndLen($px) < ndLen($cx) and ndLen($px) == $t[$cx]["cFi"] - $t[$cx]["cRo"] and ndSuf($px) == substr($src, $t[$cx]["cRo"], ndLen($px))
              or assertSay("prefix mismatch orphan" . nd2s($cx) . " for parent" . nd2s($px));
    $t[$cx]['par'] = $px;
    $t[$cx]['upd'] = $srE; 
    $t[$px]['chi'][$ch] = $cx;
    $t[$px]['upd'] = $srE; 
    $assertNo or ndCheck($cx);
}

function ndChild($px, $pre) { # find a child of $px starting with $pre
    global $t, $src;
    if (isset($t[$px]["chi"][substr($pre, 0, 1)])) {
        $cx = $t[$px]["chi"][substr($pre, 0, 1)];
        if (substr($src, $t[$cx]['cFi'], strlen($pre)) == $pre ) 
            return $cx;
        }
    return 0;
}

function ndChildRm($cx) { #remove child $cx from it's parent
    global $t, $src, $srE, $assertNo;
    $px = $t[$cx]['par'];
    $ch = substr($src, $t[$cx]["cFi"], 1);
    $assertNo or ($cx > 0) and ($px !== "-") and ($t[$px]["chi"][$ch] == $cx) or assertSay("bad child" . nd2s($cx) . " in parent" . nd2s($px));
    unset($t[$px]["chi"][$ch]);
    $t[$px]['upd'] = $srE; 
    $t[$cx]['par'] = '-';
    $t[$cx]['upd'] = $srE; 
}
    
function ndSplit($nx, $le) { ##### split node $nx, by adding a new parent of len $le. return new parent #####
    global $t, $assertNo;
    $n = &$t[$nx];
    $px = $n['par'];
    $assertNo or $le < ndLen($nx) and $le > ndPrL($nx) or assertSay("bad le $le for split" .nd2s(nx));
    ndChildRm($nx);
    $py = ndNew($px, $n['cRo'], $n['cFi'], $n['cRo'] + $le);
    $n['cFi'] = $t[$py]['cEn'];
    ndChildAdd($py, $nx);
    suffixTreeSufSet($py, $n['suf']);
    trace("splitDone len=$le nx" . nd2s($nx) . " py" . nd2s($py) . " px" . nd2s($px));
    $assertNo or ndCheck($nx) + ndCheck($py) + ndSufLenCheck($py, ndLen($py));
    return $py;
} #ndSplit 

function ndCheck($nx) { #check nd: src Indexes, parent link and prefix
    global $t, $src;
    $n = $t[$nx];
    $px = $n['par'];
    $e = '';
    if ($nx <= 0 ) {
        if ($nx !== 0 or $n['cRo'] !== 0 or $n['cFi'] !== 0 or $n['cEn'] !== 0 or $n['par'] !== '-')        
            $e = 'badRoot';
        }
    elseif ($n['cRo'] < 0 or $n['cFi'] < $n['cRo'] or $n['cEn'] <= $n['cFi'] or $n['par'] === '-')
        $e = 'badIndexes';
     elseif (! isset($t[$px])) 
        $e = 'parentNotInTree';
    elseif (ndLen($px) !== ndPrL($nx) or ndSuf($px) !== ndPre($nx))
        $e = 'prefixMismatch';
    elseif ( ndChild($px, $src[$n['cFi']]) !== $nx)
        $e = 'mismatch->parent->child';
    if ($e !== '')
        assertSay("$e nx" . nd2s($nx) . " parent" . nd2s($px));
} #ndCheck end

function ndSufEndCheck($nx, $cE) { #check suffix for end $cE
    global $t, $src, $srL;
    ndSufLenCheck($nx, ndLen($nx, $cE));
    ndLen($nx, $cE) <  ndLen($nx) or ndLen($nx) == ndLen($t[$nx]['suf']) + 1 
        or assertSay("sufEndCheck closedSuffixLengthMismatch srL=$srL cE=$cE nx" . nd2s($nx) . ' sx' . nd2s($t[$nx]['suf']));
}

function ndSufLenCheck($nx, $le) { # check suffix for len $le
    global $t;
    return ndSufSxLenCheck($nx, $t[$nx]['suf'], $le);
}

function ndSufSxLenCheck($nx, $sx, $le) { # check suffix for len $le
    global $t, $src, $srL;
    $e = '';
    if ($le === 0)
        return;
    elseif (ndLen($nx) < $le or ndLen($sx) < $le - 1)
        $e = 'tooShort';
    elseif (substr($src, $t[$nx]['cRo']+1, $le-1) !== substr($src, $t[$sx]['cRo'], $le-1))
        $e = 'sufMismatch';
    else
        return;
    assertSay("sufLenCheck $e le=$le nx" . nd2s($nx) . ' sx' . nd2s($sx) . backtrace());
}

function suffixTree2Table($up=0) {
    global $t, $src;
    $fr = FALSE;
    $s = '';
    foreach($t as $id => $r)  {
        if ($r['upd'] < $up)
            continue;
        if (! $fr) {
            $fr = $r;
            $s .= "<table border=2><tr><th>id </th>";
            foreach ($fr as $k => $v)
                $s .= "<th> $k </th>";
            $s .= "<th>delta</th><th>suffix</th>";
            }
        $s .= "</tr><tr><td>$id</td>";
        foreach($fr as $h => $hn) 
            $s .= '<td> ' . (isset($r[$h]) ? is_array($r[$h]) ? '#'.count($r[$h]) : $r[$h] : '---') . ' </td>';
        $s .= '<td>' . substr($src, $r['cFi'], $r['cEn'] - $r['cFi']) . '</td><td>' . ndSuf($id) .'</td>';
        }
    if ($fr)
        return "$s </tr></table>";
    else
        return "no rows updated >= $up";
}

function a2s($a) {
    $r = '';
    foreach($a as $k => $v)  {
        $r .= ", $k" . (is_array($v) ? "=>[" . count($v) . ': ' . a2s($v) . ']' : "=>$v");
        }
    return substr($r, 2);
}

function trace($tx, $l=10, $fu=0) {
    global $trace;
    $trace < $l or say($tx, $fu);
}

function traceSet($tr) {
    global $trace, $assert, $assertNo;
    
      $trace = max(min($tr + 0, 99), 0);
    $assert = $trace;
    $assertNo = $trace <= 0;
}

function assertSay($txt) {
    say("assert failed: $txt " . backtrace());
}

function say($tx, $fu=0) {
    global $trace;
    if ($fu==1)
        echo "<ol><li>$tx";
    elseif ($fu == 0)
        echo "</li><li>$tx";
    elseif ($fu == 2)
        echo "</li><li>$tx</li></ol>";
    elseif ($fu == 4)
        echo "$tx";
    elseif ($fu == 5)
        echo ", $tx";
    else
        echo "sayBadFun $fu $tx";
}

function backtrace($num=7) { // returns backtrace, html formatted
        ob_start();
        debug_print_backtrace(0 , $num);
        for ($tr = ob_get_contents(); substr($tr, strlen($tr)-strlen(PHP_EOL)) == PHP_EOL; $tr = substr($tr, 0, strlen($tr)-strlen(PHP_EOL))) {}
        ob_end_clean();
        return "<ul><li>" . strtr($tr, [PHP_EOL => "</li><li>"]) . "</li></ul>";
}

function highlightNum($fi) { # highlight file with lineNumbers
    $s = highlight_file($fi, true);
    $ln = 1;
    $cy = strpos($s, '<br />');
    if (false !== ($sx = strpos($s, '<span')) and false !== ($sy = strpos($s, '>', $sx)) and $sy < $cy) { # lineNo 1 after first span and possibly \n which must remain after first span
        $cx = $sy + 1 + (substr($s, $sy, 2) === ">\n");
        $r = substr($s, 0, $cx) . highlightNum1($ln) . substr($s, $cx, $cy + 6 -$cx);
        }
    else {
        $r = '*** code does not have a span berfore first &lt;br&gt;***' . substr($s, 0, $cy + 6);
        }
    
    while (FALSE !== ($cy = strpos($s, '<br />', $cx=$cy+6))) {
        $r .= highlightNum1(++$ln) . substr($s, $cx, $cy + 6 - $cx);
        } 
    return $r . substr($s, $cx);
}

function highlightNum1($ln) { # format one lineNo
    return '<span style="color: #ffffff; background-color: #a0e0a0;">' . strtr(sprintf('%6u  ', $ln), [' ' => '&nbsp;']) . '</span>&nbsp;&nbsp;';
}
?>