php/urlHref.php
<?php
/*******************************************************************************
urlHref.php: read a URL and analyze its HTML for <a href=...> links
loop
read a URL (i.e. a stream) from the queue
parse it as HTML: DOMDocument::loadHTML
find all <a href=...>
queue these href URLs,
if they conform (e.g. same host, no query, ...) and are not already queued
report stats and times used
problems:
could not get Connection: keep-alive (not a stream option?)
we get HTTP/1.1 404 Not Found, rather than the "create page" page that Firefox shows
*******************************************************************************/
// Load the shared environment; the out*() helpers used throughout this script
// are not defined here and presumably come from env.php.
require_once('env.php');
outBegin(basename(__FILE__));

// Timers (seconds, from microtime(true)):
//   $mtS  start of the run
//   $mtG  accumulated time spent fetching pages
//   $mtP  accumulated time spent parsing HTML
$mtS = microtime(true);
$mtG = $mtP = 0;

// Root of the site to crawl; use 'https://localhost' instead for local runs.
$u0 = 'https://www.wlkl.ch';

// Only hrefs matching this mask are queued: absolute URLs below $u0 that have
// no query string ('?') and do not contain "RecentChan".
// $u0 is quoted so that regex metacharacters in it (e.g. the dots in the host
// name) match literally instead of matching any character.
$uMsk = '%^' . preg_quote($u0, '%') . '/(?!.*RecentChan)[^?]+$%';

// $que: URLs to visit, in discovery order (grows while crawling).
// $u2f: url => where it was found; doubles as the "already queued" set.
$que = ["$u0/index.php"];
$u2f = [$que[0] => 0];

// Abandoned attempt to request keep-alive connections.
// NOTE(review): PHP documents the stream context options for https:// URLs
// under the 'http' key, so the 'https' key below may be why this had no
// effect - confirm before re-enabling.
# $ctx = stream_context_set_default(['https' => ['method'=>'GET', 'header' => ['Connection: keep-alive']]]);
# $a2strLevel=4;
# out('context', stream_context_get_params($ctx));
// Crawl loop: take each queued URL in order, fetch it, parse it as HTML and
// queue every new <a href> that matches $uMsk. Ends when the queue is exhausted
// or after $maxPages fetch attempts (failed fetches count as well).
// $que grows inside the loop, so count($que) has to be re-evaluated each pass.
$maxPages = 5;
for ($wx = 0; $wx < count($que) && $wx < $maxPages; $wx++) {
    $url = $que[$wx];
    out($url, 'from', $u2f[$url] ?? '-');
    outOL();

    // Fetch the page; $mtG accumulates the time spent in file_get_contents.
    // Warnings are silenced with @ and picked up via error_get_last() instead.
    error_clear_last();
    $mtG -= microtime(true);
    $h = @file_get_contents($url);
    $mtG += microtime(true);
    if (false === $h) {
        outOLEnd("bad get in $url from", $u2f[$url], error_get_last());
        continue;
    }
    outLi(strlen($h), "chars", $http_response_header);

    // Parse the page; $mtP accumulates the time spent in loadHTML.
    // The success flag must come from loadHTML's return value: $d itself is
    // always an object and can never be false. An empty body is treated as a
    // parse failure up front, because loadHTML('') throws a ValueError on
    // PHP 8 and @ does not suppress exceptions.
    $mtP -= microtime(true);
    $d = new DOMDocument();
    error_clear_last();
    $parsed = '' !== $h && @$d->loadHTML($h);
    $mtP += microtime(true);
    if (! $parsed) {
        outOLEnd("bad html in $url from", $u2f[$url], ", html $h", error_get_last());
        continue;
    }

    // Classify every anchor: empty, rejected by the mask, already known, or new.
    $aL = $d->getElementsByTagName('a');
    outLi(count($aL), '<a...>');
    foreach ($aL as $a) {
        $href = $a->getAttribute('href');
        if ('' === $href) {
            // Strict comparison: empty() would also reject the valid href "0".
            outLi("empty href # " . $d->saveHTML($a));
        } elseif (! preg_match($uMsk, $href)) {
            outLi("skipping $href");
        } elseif (isset($u2f[$href])) {
            outLi("already $href");
        } else {
            outLi("queued $href");
            // Remember the page on which this URL was first seen.
            $u2f[$href] = [$url];
            $que[] = $href;
        }
    }
    outOLEnd();
}
// Final report: number of loop passes, then elapsed / fetch / parse seconds in
// scientific notation, then the final queue length.
$elapsed = microtime(1) - $mtS;
$timing = sprintf('%9.3e ela, %9.3e get, %9.3e parse', $elapsed, $mtG, $mtP);
out("$wx pages,", $timing, ", queue", count($que));
outEnd(__FILE__);
?>