# python/sitemap.py

"""

"""
from io import StringIO
import os

def err(*m):
    """Report a fatal problem on stdout and abort the script.

    Prints a banner naming this file followed by the values in *m*, then
    deliberately divides by zero so execution stops with a
    ZeroDivisionError traceback.
    """
    location = __file__ + ':'
    print('\n*****\nerror in', location, *m)
    # Intentional crash: the division by zero is what terminates the run.
    _ = 1 / 0

# Input: saved HTML page to scan.  Output: one absolute URL per line.
iN = '/wkData/tmp/sitemap.html'
oN = 'sitemap.txt'

# Both files are managed by `with` so they are closed even when err() aborts
# the run part-way through.
with open(iN, "r", encoding="utf-8") as src, \
        open(oN, 'w', encoding="utf-8") as dst:
    # Phase 1: skip everything up to (and including) the line that opens <main.
    for line in src:
        if '<main' in line:
            break
    else:
        # End of file reached without seeing the opening tag.
        err('<main not found in', iN)
    print('found main', line)

    # Phase 2: collect hrefs until the line that closes </main.
    # Only the first href on each line is examined.
    for line in src:
        if '</main' in line:
            break
        pos = line.find('href=')
        if pos < 0:
            # str.find returns -1 on a miss; 0 is a valid hit at line start.
            continue
        rest = line[pos + 5:]
        # startswith() also copes with an empty remainder (href= at the very
        # end of the file), reporting it instead of raising IndexError.
        if not rest.startswith('"'):
            err('href without "', rest, 'from', line)
        end = rest.find('"', 1)
        if end < 1:
            err('href without ending "', end, rest, 'from', line)
        url = rest[1:end]
        if '://' not in url:
            print('ignoring', url, 'from', line)
        else:
            # Text-mode files translate '\n' to the platform line ending
            # themselves; writing os.linesep would produce '\r\r\n' on Windows.
            dst.write(url + '\n')
    else:
        # End of file reached without seeing the closing tag; report the
        # path (not the file object) so the message matches the one above.
        err('</main not found in', iN)
    print('found end main', line)