php/urlHref.php

<?php
/*******************************************************************************
urlHref.php: read url, analyze HTML for <a href

loop
    take the next url from the queue and read it (as a stream)
    parse it as HTML: DOMDocument::loadHTML
    find all <a href=...>
    queue these href urls,
        if they conform (e.g. same host, no query ...) and are not already queued

report stats and times used

problems:
    could not get "Connection: keep-alive" to work (not a stream option?)
    we get HTTP/1.1 404 Not Found instead of the "create page" page shown in firefox
*******************************************************************************/

// Output helpers (out, outOL, outLi, outOLEnd, outBegin, outEnd) are expected from env.php.
require_once('env.php');
outBegin(basename(__file__));

// $mtS: wall-clock start of the run.
// $mtG / $mtP: accumulated seconds spent in fetching (get) and in HTML parsing.
$mtS = microtime(1);
$mtG = $mtP = 0;

// Base url of the site to crawl. The second assignment wins; the first one is
// kept as a quick switch for local testing.
$u0 = 'https://localhost';
$u0 = 'https://www.wlkl.ch';

// Only urls matching this mask are queued: they must start with "$u0/", must
// not contain "RecentChan" anywhere after the base, and must not contain a
// query string ('?').
// preg_quote() makes the base url match literally; without it the dots in the
// host name would match any character and let look-alike hosts through.
$uMsk = '%^' . preg_quote($u0, '%') . '/(?!.*RecentChan)[^?]+$%';

// $que: urls to visit, in discovery order; it grows while the crawl runs.
// $u2f: url => where it was found; the start page is marked with 0.
$que = ["$u0/index.php"];
$u2f = [$que[0] => 0];

// Disabled attempt to request keep-alive connections (see "problems" in the file header).
// NOTE(review): PHP documents the HTTP context options under the 'http' key, also
// for https:// urls - the 'https' key used below may be why it had no effect; confirm.
# $ctx = stream_context_set_default(['https' => ['method'=>'GET', 'header' => ['Connection: keep-alive']]]);
# $a2strLevel=4;
# out('context', stream_context_get_params($ctx));
// Crawl loop: visit the queued urls in order, at most 5 pages per run.
// $que grows inside the loop, so count($que) is re-evaluated on every pass.
// For each page: fetch it, parse the HTML, and queue every <a href> that
// matches $uMsk and has not been seen before.
// outOL / outLi / outOLEnd are env.php helpers - presumably they open, fill
// and close a list in the report; verify against env.php.
for ($wx = 0; $wx < count($que) && $wx < 5; $wx++) {
    $url = $que[$wx];
    out($url, 'from', $u2f[$url] ?? '-');
    outOL();

    // Fetch the page; only the time spent inside file_get_contents is added to $mtG.
    error_clear_last();
    $mtG -= microtime(1);
    $h = @file_get_contents($url);
    $mtG += microtime(1);
    if (false === $h) {
        outOLEnd("bad get in $url from", $u2f[$url], error_get_last());
        continue;
    }
    outLi(strlen($h), "chars", $http_response_header);

    // Parse the page; only the time spent in DOMDocument is added to $mtP.
    $mtP -= microtime(1);
    $d = new DOMDocument();
    error_clear_last();
    // loadHTML() reports failure through its return value - $d itself is always
    // an object. An empty body is rejected up front because loadHTML() refuses
    // an empty string (a ValueError on PHP 8, which '@' would not suppress).
    $ok = '' !== $h && @$d->loadHTML($h);
    $mtP += microtime(1);
    if (! $ok) {
        outOLEnd("bad html in $url from", $u2f[$url], ", html $h", error_get_last());
        continue;
    }

    $aL = $d->getElementsByTagName('a');
    outLi($aL->length, '<a...>');
    foreach ($aL as $a) {
        // getAttribute() yields '' when the attribute is missing; compare
        // strictly so that href="0" is not mistaken for an empty href.
        $href = $a->getAttribute('href');
        if ('' === $href) {
            outLi("empty href # " . $d->saveHTML($a));
        } elseif (! preg_match($uMsk, $href)) {
            // Relative hrefs end up here too: the mask requires the absolute base url.
            outLi("skipping $href");
        } elseif (isset($u2f[$href])) {
            outLi("already $href");
        } else {
            outLi("queued $href");
            $u2f[$href] = [$url];   // remember the page on which the url was found
            $que[] = $href;
        }
    }
    outOLEnd();
}
// Final statistics: pages visited, then elapsed / fetch / parse seconds in
// scientific notation, then the final queue length.
$elapsedSecs = microtime(1) - $mtS;
$timingText = sprintf('%9.3e ela, %9.3e get, %9.3e parse', $elapsedSecs, $mtG, $mtP);
out("$wx pages,", $timingText, ", queue", count($que));
outEnd(__FILE__);

?>