php/suffixTree.php
<form action="<?php echo basename(__file__); ?>" method="post">
<h2><?php echo basename(__file__); ?> compute suffix tree für String</h3>
<?php
$src = isset($_POST['string']) ? $_POST['string'] : 'kakao';
traceSet(isset($_POST['trace']) ? $_POST['trace'] : 1);
$withTrie = isset($_POST['trie']);
?>
<p>string: <input type="text" name="string" size="100" value=<?php echo '"' . $src . '"'; ?> /></p>
<p><input type="checkbox" name="trie" <?php if ($withTrie) echo "checked"; ?> />also suffixTrie
<input type="input" name="trace" maxlength=2 size=2 value="<?php echo $trace; ?>" />trace
<p><input type="submit" value="compute tree" /></p>
<?php if ($withTrie) {
echo "<h2> suffix trie für $src </h2>";
$t = suffixTrie($src);
echo "srcLen=" . strLen($src) . " treeCount=" . count($t);
showTable($t);
}
?>
<h2> suffix tree für <?php echo "len=" . strlen($src) . ": $src"; ?> </h2>
<?php
suffixTree($src);
say($suffixTreeInfo . suffixTree2Table(), 4);
?>
<h2>Source</h2>
<?php echo highlightNum(__file__); ?>
</form>
<?php
function suffixTrie($src) {
$fx = 0;
trace("start of suffixTrie $src", 1, 1);
$t = [];
suffixTrieAdd($t, 0, '', 0, 0);
$t[0]['substr'] = $src;
for ($cy=0; $cy < strlen($src); $cy++) {
$ch = substr($src, $cy, 1);
$fx = $nx = suffixTrieAdd($t, $fx, 0, $cy); #node for longest substring
trace("for1 cy=$cy nx=$nx");
for ($cx=1; $t[$nx]['par']; $cx++) { # handle all suffixes
trace("for2 cy=$cy nx=$nx cx=$cx");
$st = $t[$t[$nx]['par']]['suf']; # suffix of parent
if ($sx = suffixTrieChild($t, $st, $ch)) {
$t[$nx]['suf'] = $sx; # found follower character - remaining boundary path ok
break;
}
$nx = $t[$nx]['suf'] = suffixTrieAdd($t, $st, $cx, $cy); # add current character
}
}
trace("end of suffixTrie $src", 1, 2);
return $t;
}
function suffixTrieAdd(&$t, $px, $cx, $cy) {
$nx = count($t);
trace("add nx=$nx cx=$cx $cy");
$t[$nx] = ['par' => $px, 'cFi' => $cx, 'cLa' => $cy, 'suf' => 0, 'chi' => 0];
$n = &$t[$nx];
if ($nx) {
$src = $t[0]['substr'];
$n['sis'] = $t[$px]['chi'];
$t[$px]['chi'] = $nx;
$n['substr'] = str_repeat('_', $cx) . substr($src, $cx, $cy + 1 - $cx);
$n['delta'] = substr($src, $cy, 1);
}
else {
$n['sis'] = 0;
$n['substr'] = $n['delta'] = '';
}
return $nx;
}
function suffixTrieChild(&$t, $px, $ch) {
for ($cx = $t[$px]['chi']; $cx; $cx = $t[$cx]['sis']) {
if ($t[$cx]['delta'] == $ch )
return $cx;
}
return 0;
}
function showTable($t) {
$hx = 0;
$hd = [];
foreach($t as $rk => $r) {
foreach($r as $k => $v) {
if( ! isset($hd[$k]))
$hd[$k] = ($hx++);
}
}
echo "<table border=2><tr><th>id </th>";
foreach ($hd as $h => $hn)
echo "<th> $h </th>";
foreach($t as $rk => $r) {
echo "</tr><tr><td> $rk </td>";
foreach($hd as $h => $hn) {
echo '<td> ' . (isset($r[$h]) ? is_array($r[$h]) ? '#'.count($r[$h]) : $r[$h] : '---') . ' </td>';
}
}
echo "</tr></table>";
}
function suffixTree($aSrc) { # create suffixTree for string $aSrc
global $t, $src, $srE, $srL, $cUpdateFor, $cAncCx2Par, $trace, $suffixTreeInfo, $assertNo;
$srL = strlen($src = $aSrc);
$srE = $cUpdateFor = $cBoundary2Suf = $cAncCx2Par = 0;
$chk = -99;
trace("suffixTree len=" . strlen($src) . " src=$src", 1, 1);
$t = [];
$root = ndNew('-', 0, 0, 0);
$lx = ndNew($root, 0, 0, $srL); # last leaf on boundary path
$loop=1e30;
for ($srE=1; $srE <= $srL; $srE++) {
if ($loop-- < 0) die('loop');
$ch = substr($src, $srE-1, 1);
trace("expandTo srcEnd=srE=$srE ch=$ch lx" . nd2s($lx));
$assertNo or $t[$lx]['cEn'] == $srL or assertSay("active point not to end lx" . nd2s($lx));
# skip over final leaves pointing to final leaves
for ($rx = $t[$lx]['cRo']; $lx and $rx < $srL and ($sx=$t[$lx]['suf']) and $t[$sx]['cEn'] == $srL and $rx + 1 == $t[$sx]['cRo']; $rx++) {
trace("finalLeave from" .nd2s($lx) . " to" . nd2s($sx), 2);
$lx = $sx;
$cBoundary2Suf ++ ;
}
trace("expandToLastFinal srE=$srE lx=$lx ch=$ch");
suffixTreeUpdate($lx, ndLen($lx, $srE));
trace("expandToEnd srE=$srE lx=$lx ch=$ch");
$assertNo or suffixTreeCheck($chk = $srE);
}
if ($chk !== $srL)
suffixTreeCheck($srL);
trace($suffixTreeInfo = "suffixTreeEnd len=" . strlen($src) . " treeCount=" . count($t) . " cBoundary2Suf=$cBoundary2Suf cUpdateFor=$cUpdateFor cAncCx2Par=$cAncCx2Par src=$src", 1, 2);
return $t;
} # suffixTree end
function suffixTreeUpdate($bx, $le) { # extend unfinished suffixTree from length $le-1 to $le. $bx is the node for the complete source string
global $t, $src, $srL, $srE, $loop, $cUpdateFor, $assertNo;
$cE = $t[$bx]['cRo'] + $le;
for ($sL=$le ; $bx; $bx=$sx) { # walk from active point along boundary path
if ($loop-- < 0) die('loop');
$cUpdateFor++;
$sL--; # new len of suffix
$b = &$t[$bx];
$sw = $b['suf'];
trace("for3 sL=$sL, b" . nd2s($bx) . " s" . nd2s($sw), 2);
$assertNo or ndSufLenCheck($bx, $sL);
$di = (ndLen($sw) < $sL or substr($src, $b['cRo'] + $sL, 1) != substr($src, $t[$sw]['cRo'] + $sL-1, 1)); # is suffix different for new len?
$sx = suffixTreeAncCx($sw, $di ? $sL - 2 : ndLen($bx) - 1); #ancestor containing lastEqual character
$s = &$t[$sx];
$assertNo or ndSufSxLenCheck($bx, $sx, $sL);
trace("ancestor" . ($sw === $sx ? '===' : '???') . "sx=$sx sw=$sw dB=dB ",11);
$py = 0;
$sy = $sx;
if ($sL < 1) {
break;
}
elseif (ndLen($sx) < $sL) {
if (! $sy = ndChild($sx, substr($src, $b['cRo'] + 1 + $s["cEn"] - $s["cRo"], 1))) {
$sy = ndNew($sx, $srE - $sL, ndLen($sx) + $srE - $sL, $srL);
suffixTreeSufSet($sy, $s['suf']);
$assertNo or ndSufEndCheck($sy, $srE-1);
}
}
elseif (! $di) {
if ($b['cEn'] > $srE or ndLen($sx) == ndLen($bx) - 1) {
if ($sy == $sw)
break;
}
else {
$assertNo or $sL <= ndLen($sy) and $sL > ndPrL($sy) or assertSay("sL=$sL longer sy" . nd2s($sy));
if ($sL < ndLen($sy)) {
$sz = ndSplit($sy, $sL);
if ($b['cEn'] == $srL or ndLen($bx) - 1 <= ndLen($sz) )
$sy = $sz;
}
}
}
else {
$px = $sx;
if ($sL-1 != ndLen($px))
$py = $px = ndSplit($px, $sL-1);
$sy = ndNew($px, $b['cRo']+1, ndLen($px) + 1 + $b['cRo'], $srL);
suffixTreeSufSet($sy, $s['suf']);
trace("added3 ky" . nd2s($sy) . " py" . nd2s($py));
}
suffixTreeSufSet($bx, $sy);
if ($sw !== $b['suf']) {
$sx = $b['suf'];
trace("Update4 bx=$bx sx" . nd2s($sx) . " par" . nd2s($t[$sx]['par']), 2);
}
$assertNo or ndSufEndCheck($bx, $srE);
if ($py)
suffixTreeUpdate($py, ndLen($py));
}
} # suffixTreeUpdate end
function suffixTreeCheck($cE) { #check unfinishe suffix tree for length $cE
global $t, $src, $srL, $trace;
if ($trace >= 1)
say("updates for $cE " . ($cE < 12 ? substr($src, 0, $cE) : '...' . substr($src, $cE-10, 10)) . '... ' . suffixTree2Table($cE >= 3 ? $cE : 0));
$cc = count($t);
trace("checkTree curEnd=cE=$cE count $cc: boundary (0, $srL) ... (" . ($cE-1) . ", $srL)", 1, 5);
$sx = 1;
$boC = 0;
$boLi = '';
for ($cx=0; $cx < $cE; $cx++ ) {
ndLen($sx, $cE) >= $cE-$cx and substr($src, $cx, $cE-$cx) == substr($src, $t[$sx]["cRo"], $cE-$cx)
or assertSay("boundary $sx <-> suffix $cx");
if ($t[$sx]["cRo"] != $cx or $t[$sx]["cEn"] != $srL or count($t[$sx]["chi"]) != 0) {
$boC++;
if (FALSE === strPos("$boLi,", ",$sx,"))
$boLi .= ",$sx";
}
$sx = $t[$sx]['suf'];
}
trace("temporaryBoundaries" . ( $boC ? " $boC (" . substr($boLi, 1) .")" : 'None'), 1, 5); # should be empty after stopper character
trace("parent suffix", 1, 5);
for ($nx=1; $nx < count($t); $nx++) {
ndCheck($nx) + ndSufEndCheck($nx, $cE);
}
trace("treeStructure", 1, 5);
$chi = [($px = 0) => 1];
while (TRUE) {
$px < $cc and isset($t[$px]) and $chi[$px] == 1 or assertSay('missing parent' . nd2s($px));
$chi[$px] = 2;
foreach ($t[$px]['chi'] as $ch => $cx) {
strlen($ch) == 1 and $px == $t[$cx]["par"] and $ch == substr($src, $t[$cx]["cFi"], 1) or assertSay("bad chi ch=$ch par" . nd2s($px) . " cx" .nd2s($cx));
! isset($chi[$cx]) or assertSay("duplicate child $cx in par $px");
$chi[$cx] = 1;
}
foreach($chi as $px => $v) {
if ($v == 1)
continue 2;
}
break;
}
for ($nx=1; $nx < count($t); $nx++) {
isset($chi[$nx]) or assertSay("missing child $nx");
}
trace("endCheck $cE", 1, 5);
} # suffixTreeCheck end
function suffixTreeAncCx($nx, $ox) { # find ancestor of $ox't character of suffix in delta
global $t, $cAncCx2Par, $assertNo;
for ($ax=$nx; $ax > 0 and $ox < ndPrL($ax); $ax=$t[$ax]['par']) {
$cAncCx2Par ++ ;
trace("ancCxDown ox=$ox from" . nd2s($nx) . " to" . nd2s($ax), 2);
}
$assertNo or ($ox < 0 and $ax === 0) or ($ox >= ndPrL($ax) and ($ox < ndLen($ax) or $ax === $nx)) or assertSay("ox=$ox not in delta of ax" . nd2s($ax) . " nx" . nd2s($nx));
return $ax;
}
function suffixTreeSufSet($nx, $sx) { # set suf from ancestor of $sx with offset $ox in delta
global $t, $src, $srE, $assertNo;
$sy = suffixTreeAncCx($sx, ndLen($nx) -2);
if ($sy != $t[$nx]['suf']) {
$t[$nx]['suf'] = $sy;
trace("SufSet nx" .nd2s($nx) . " sy" . nd2s($sy), 12);
$t[$nx]['upd'] = $srE;
}
$assertNo or ndSufLenCheck($nx, ndLen($nx, $srE - 1));
}
function ndNew($px, $cRo, $cFi, $cEn) { # create a new node: suffix = substr($src, $cRo, $cEn-$cRo), delta = substr($src, $cFi, $cFi-$cRo)
global $t, $src, $srE, $assertNo;
$nx = count($t);
$t[$nx] = ['par' => '-', 'cRo' => $cRo, 'cFi' => $cFi, 'cEn' => $cEn, 'suf' => 0, 'chi' => [], 'cre' => $srE, 'upd' => $srE];
if ($px !== '-')
ndChildAdd($px, $nx);
$assertNo or ndCheck($nx);
trace("added nx". nd2s($nx) . " px" . ($px === '-' ? $px : nd2s($px)), 12);
return $nx;
}
function nd2s($nx) { # show node $nx as string
global $t, $src;
if (! isset($t[$nx]))
return "====[id=$nx notIn t]";
$n = &$t[$nx];
return "===[id=>$nx, " .a2s($t[$nx]) . ", delta=>" . substr($src, $t[$nx]['cFi'], $t[$nx]['cEn'] - $t[$nx]['cFi']) . " suf=>" . ndSuf($nx) ."]";
}
function ndLen($nx, $cE=9e9) { # return length of suffix of node, if second parameter specified only to the currentEnd $cE of $src
global $t, $src, $srL, $assertNo;
$le = min($t[$nx]['cEn'], $cE) - $t[$nx]['cRo'];
$assertNo or $le >= 0 and $le <= $srL or assertSay('bad len $le for ce=$cE in nx' . nd2s($nx));
return $le;
}
function ndSuf($nx, $cE=9e9) { # return suffix of node $nx, if second parameter specified only to the currentEnd $cE of $src
global $t, $src, $assertNo;
$assertNo or $nx !== '-' or assertSay("bad nx=$nx");
return substr($src, $t[$nx]['cRo'], ndLen($nx, $cE));
}
function ndPrL($nx) { # length of prefix of current path: from root to parent node
global $t, $src, $assertNo;
$le = $t[$nx]['cFi'] - $t[$nx]['cRo'];
$assertNo or $nx >= 0 and $le >= 0 and ($nx === 0 or substr($src, $t[$nx]["cRo"], $le) === ndSuf($t[$nx]["par"]))
or assertSay("badPrefix le=$e nx" . nd2s($nx) . ' par' . nd2s($t[$nx]["par"]));
return $le;
}
function ndPre($nx) { # prefix of current path: from root to parent node
global $t, $src;
return substr($src, $t[$nx]['cRo'], ndPrL($nx));
}
function ndChildAdd($px, $cx) { # add to parent $px the child $cx
global $t, $src, $srE, $assertNo;
$ch = substr($src, $t[$cx]['cFi'], 1);
$assertNo or $cx > 0 and $t[$cx]["par"] == "-" and ! isset($t[$px]["chi"][$ch]) or assertSay("ch=$ch bad orphan" .nd2s($cx) . " for parent" . nd2s($px));
$assertNo or ndLen($px) < ndLen($cx) and ndLen($px) == $t[$cx]["cFi"] - $t[$cx]["cRo"] and ndSuf($px) == substr($src, $t[$cx]["cRo"], ndLen($px))
or assertSay("prefix mismatch orphan" . nd2s($cx) . " for parent" . nd2s($px));
$t[$cx]['par'] = $px;
$t[$cx]['upd'] = $srE;
$t[$px]['chi'][$ch] = $cx;
$t[$px]['upd'] = $srE;
$assertNo or ndCheck($cx);
}
function ndChild($px, $pre) { # find a child of $px starting with $pre
global $t, $src;
if (isset($t[$px]["chi"][substr($pre, 0, 1)])) {
$cx = $t[$px]["chi"][substr($pre, 0, 1)];
if (substr($src, $t[$cx]['cFi'], strlen($pre)) == $pre )
return $cx;
}
return 0;
}
function ndChildRm($cx) { #remove child $cx from it's parent
global $t, $src, $srE, $assertNo;
$px = $t[$cx]['par'];
$ch = substr($src, $t[$cx]["cFi"], 1);
$assertNo or ($cx > 0) and ($px !== "-") and ($t[$px]["chi"][$ch] == $cx) or assertSay("bad child" . nd2s($cx) . " in parent" . nd2s($px));
unset($t[$px]["chi"][$ch]);
$t[$px]['upd'] = $srE;
$t[$cx]['par'] = '-';
$t[$cx]['upd'] = $srE;
}
function ndSplit($nx, $le) { ##### split node $nx, by adding a new parent of len $le. return new parent #####
global $t, $assertNo;
$n = &$t[$nx];
$px = $n['par'];
$assertNo or $le < ndLen($nx) and $le > ndPrL($nx) or assertSay("bad le $le for split" .nd2s(nx));
ndChildRm($nx);
$py = ndNew($px, $n['cRo'], $n['cFi'], $n['cRo'] + $le);
$n['cFi'] = $t[$py]['cEn'];
ndChildAdd($py, $nx);
suffixTreeSufSet($py, $n['suf']);
trace("splitDone len=$le nx" . nd2s($nx) . " py" . nd2s($py) . " px" . nd2s($px));
$assertNo or ndCheck($nx) + ndCheck($py) + ndSufLenCheck($py, ndLen($py));
return $py;
} #ndSplit
function ndCheck($nx) { #check nd: src Indexes, parent link and prefix
global $t, $src;
$n = $t[$nx];
$px = $n['par'];
$e = '';
if ($nx <= 0 ) {
if ($nx !== 0 or $n['cRo'] !== 0 or $n['cFi'] !== 0 or $n['cEn'] !== 0 or $n['par'] !== '-')
$e = 'badRoot';
}
elseif ($n['cRo'] < 0 or $n['cFi'] < $n['cRo'] or $n['cEn'] <= $n['cFi'] or $n['par'] === '-')
$e = 'badIndexes';
elseif (! isset($t[$px]))
$e = 'parentNotInTree';
elseif (ndLen($px) !== ndPrL($nx) or ndSuf($px) !== ndPre($nx))
$e = 'prefixMismatch';
elseif ( ndChild($px, $src[$n['cFi']]) !== $nx)
$e = 'mismatch->parent->child';
if ($e !== '')
assertSay("$e nx" . nd2s($nx) . " parent" . nd2s($px));
} #ndCheck end
function ndSufEndCheck($nx, $cE) { #check suffix for end $cE
global $t, $src, $srL;
ndSufLenCheck($nx, ndLen($nx, $cE));
ndLen($nx, $cE) < ndLen($nx) or ndLen($nx) == ndLen($t[$nx]['suf']) + 1
or assertSay("sufEndCheck closedSuffixLengthMismatch srL=$srL cE=$cE nx" . nd2s($nx) . ' sx' . nd2s($t[$nx]['suf']));
}
function ndSufLenCheck($nx, $le) { # check suffix for len $le
global $t;
return ndSufSxLenCheck($nx, $t[$nx]['suf'], $le);
}
function ndSufSxLenCheck($nx, $sx, $le) { # check suffix for len $le
global $t, $src, $srL;
$e = '';
if ($le === 0)
return;
elseif (ndLen($nx) < $le or ndLen($sx) < $le - 1)
$e = 'tooShort';
elseif (substr($src, $t[$nx]['cRo']+1, $le-1) !== substr($src, $t[$sx]['cRo'], $le-1))
$e = 'sufMismatch';
else
return;
assertSay("sufLenCheck $e le=$le nx" . nd2s($nx) . ' sx' . nd2s($sx) . backtrace());
}
function suffixTree2Table($up=0) {
global $t, $src;
$fr = FALSE;
$s = '';
foreach($t as $id => $r) {
if ($r['upd'] < $up)
continue;
if (! $fr) {
$fr = $r;
$s .= "<table border=2><tr><th>id </th>";
foreach ($fr as $k => $v)
$s .= "<th> $k </th>";
$s .= "<th>delta</th><th>suffix</th>";
}
$s .= "</tr><tr><td>$id</td>";
foreach($fr as $h => $hn)
$s .= '<td> ' . (isset($r[$h]) ? is_array($r[$h]) ? '#'.count($r[$h]) : $r[$h] : '---') . ' </td>';
$s .= '<td>' . substr($src, $r['cFi'], $r['cEn'] - $r['cFi']) . '</td><td>' . ndSuf($id) .'</td>';
}
if ($fr)
return "$s </tr></table>";
else
return "no rows updated >= $up";
}
function a2s($a) {
$r = '';
foreach($a as $k => $v) {
$r .= ", $k" . (is_array($v) ? "=>[" . count($v) . ': ' . a2s($v) . ']' : "=>$v");
}
return substr($r, 2);
}
function trace($tx, $l=10, $fu=0) {
global $trace;
$trace < $l or say($tx, $fu);
}
function traceSet($tr) {
global $trace, $assert, $assertNo;
$trace = max(min($tr + 0, 99), 0);
$assert = $trace;
$assertNo = $trace <= 0;
}
function assertSay($txt) {
say("assert failed: $txt " . backtrace());
}
function say($tx, $fu=0) {
global $trace;
if ($fu==1)
echo "<ol><li>$tx";
elseif ($fu == 0)
echo "</li><li>$tx";
elseif ($fu == 2)
echo "</li><li>$tx</li></ol>";
elseif ($fu == 4)
echo "$tx";
elseif ($fu == 5)
echo ", $tx";
else
echo "sayBadFun $fu $tx";
}
function backtrace($num=7) { // returns backtrace, html formatted
ob_start();
debug_print_backtrace(0 , $num);
for ($tr = ob_get_contents(); substr($tr, strlen($tr)-strlen(PHP_EOL)) == PHP_EOL; $tr = substr($tr, 0, strlen($tr)-strlen(PHP_EOL))) {}
ob_end_clean();
return "<ul><li>" . strtr($tr, [PHP_EOL => "</li><li>"]) . "</li></ul>";
}
function highlightNum($fi) { # highlight file with lineNumbers
$s = highlight_file($fi, true);
$ln = 1;
$cy = strpos($s, '<br />');
if (false !== ($sx = strpos($s, '<span')) and false !== ($sy = strpos($s, '>', $sx)) and $sy < $cy) { # lineNo 1 after first span and possibly \n which must remain after first span
$cx = $sy + 1 + (substr($s, $sy, 2) === ">\n");
$r = substr($s, 0, $cx) . highlightNum1($ln) . substr($s, $cx, $cy + 6 -$cx);
}
else {
$r = '*** code does not have a span berfore first <br>***' . substr($s, 0, $cy + 6);
}
while (FALSE !== ($cy = strpos($s, '<br />', $cx=$cy+6))) {
$r .= highlightNum1(++$ln) . substr($s, $cx, $cy + 6 - $cx);
}
return $r . substr($s, $cx);
}
function highlightNum1($ln) { # format one lineNo
return '<span style="color: #ffffff; background-color: #a0e0a0;">' . strtr(sprintf('%6u ', $ln), [' ' => ' ']) . '</span> ';
}
?>