consentos/apps/scanner/src/sitemap.py
James Cottrill e0f1dd43e8 fix(scanner): reliable cookie discovery, auto-categorisation, and scan scheduling UI (#7)
Scanner fixes:
- Remove conflicting ``path`` from consent pre-seed cookie (Playwright
  rejects cookies with both ``url`` and ``path``).
- Switch to ``networkidle`` + 5s + 2s delayed second-pass for reliable
  cookie capture (see the sketch after this list).
- Check sitemap Content-Type to skip SPA HTML fallbacks.
- Propagate ``auto_category`` from scan results to the cookies table
  during sync (was silently dropped).
- Add ``_gcl_ls`` to the Open Cookie Database CSV.
- Classify ``_consentos_*`` cookies as necessary directly in the
  classification engine.
- Add ``seed_known_cookies`` to the bootstrap init container command.
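
The crawler code itself is not in this file; a minimal sketch of how the first two
fixes might look, assuming Playwright's async API and a hypothetical
``_consentos_consent`` pre-seed cookie name:

```python
# Illustrative only: not the scanner's actual code. The capture timings follow
# the bullets above; the cookie name and function shape are assumptions.
import asyncio
from playwright.async_api import async_playwright


async def capture_cookies(url: str) -> list[dict]:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context()
        # Pre-seed the consent cookie with "url" only; Playwright rejects
        # cookies that carry both "url" and "path".
        await context.add_cookies(
            [{"name": "_consentos_consent", "value": "granted", "url": url}]
        )
        page = await context.new_page()
        # Wait for networkidle, pause 5s, then take a delayed second pass
        # 2s later to catch cookies set by late-loading tag managers.
        await page.goto(url, wait_until="networkidle")
        await asyncio.sleep(5)
        cookies = await context.cookies()
        await asyncio.sleep(2)
        cookies = await context.cookies() or cookies
        await browser.close()
        return cookies
```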

Admin UI:
- Add scan schedule control to the Scans tab — preset options
  (disabled/daily/weekly/fortnightly/monthly) plus custom cron input.
  Saves ``scan_schedule_cron`` on the site config (a validation sketch
  follows below).
2026-04-18 20:14:32 +01:00
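
The admin UI and site-config model are likewise not shown here; one plausible
guard for the custom cron input before ``scan_schedule_cron`` is persisted,
assuming the ``croniter`` package is available:

```python
# Hypothetical helper, not taken from the repo: validates a custom cron
# expression before it is saved to the site config.
from croniter import croniter


def validate_scan_schedule(value: str | None) -> str | None:
    """Return a valid cron expression, or None when scheduling is disabled."""
    if not value:
        return None
    if not croniter.is_valid(value):
        raise ValueError(f"invalid cron expression: {value!r}")
    return value
```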

"""Sitemap parser for URL discovery.
Fetches and parses XML sitemaps (including sitemap indexes) to discover
URLs for crawling. Falls back to common page paths if no sitemap exists.
"""
from __future__ import annotations
import logging
from xml.etree import ElementTree
import httpx
logger = logging.getLogger(__name__)
# XML namespace used in sitemaps
_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
# Common page paths to try when no sitemap is available
_DEFAULT_PATHS = [
"/",
"/about",
"/contact",
"/privacy",
"/privacy-policy",
"/terms",
"/cookie-policy",
]
async def discover_urls(
domain: str,
*,
max_urls: int = 50,
timeout: float = 10.0,
) -> list[str]:
"""Discover URLs for a domain via sitemap or fallback paths.
Attempts to fetch /sitemap.xml first. If that fails, tries
/robots.txt for a Sitemap directive. Falls back to default paths.
"""
base = f"https://{domain}"
urls: list[str] = []
async with httpx.AsyncClient(
timeout=timeout,
follow_redirects=True,
verify=False, # noqa: S501 — scanning may target sites with self-signed certs
) as client:
# Try sitemap.xml
sitemap_urls = await _fetch_sitemap(client, f"{base}/sitemap.xml", max_urls)
if sitemap_urls:
return sitemap_urls[:max_urls]
# Try robots.txt for Sitemap directive
sitemap_url = await _find_sitemap_in_robots(client, f"{base}/robots.txt")
if sitemap_url:
sitemap_urls = await _fetch_sitemap(client, sitemap_url, max_urls)
if sitemap_urls:
return sitemap_urls[:max_urls]
# Fallback to default paths
urls = [f"{base}{path}" for path in _DEFAULT_PATHS]
return urls[:max_urls]
async def _fetch_sitemap(
client: httpx.AsyncClient,
url: str,
max_urls: int,
) -> list[str]:
"""Fetch and parse an XML sitemap. Handles sitemap indexes."""
try:
resp = await client.get(url)
if resp.status_code != 200:
return []
# SPAs with catch-all nginx/Caddy rules return 200 + text/html
# for /sitemap.xml. Don't try to parse HTML as XML.
content_type = resp.headers.get("content-type", "")
if "html" in content_type and "xml" not in content_type:
logger.debug("Sitemap %s returned HTML, skipping", url)
return []
root = ElementTree.fromstring(resp.text)
# Check if it's a sitemap index
sitemaps = root.findall("sm:sitemap/sm:loc", _NS)
if sitemaps:
urls: list[str] = []
for sm_loc in sitemaps:
if sm_loc.text:
child_urls = await _fetch_sitemap(client, sm_loc.text, max_urls - len(urls))
urls.extend(child_urls)
if len(urls) >= max_urls:
break
return urls[:max_urls]
# Regular sitemap — extract <loc> URLs
locs = root.findall("sm:url/sm:loc", _NS)
return [loc.text for loc in locs if loc.text][:max_urls]
except Exception as exc:
logger.debug("Failed to fetch sitemap %s: %s", url, exc)
return []
async def _find_sitemap_in_robots(
client: httpx.AsyncClient,
robots_url: str,
) -> str | None:
"""Look for a Sitemap directive in robots.txt."""
try:
resp = await client.get(robots_url)
if resp.status_code != 200:
return None
for line in resp.text.splitlines():
stripped = line.strip()
if stripped.lower().startswith("sitemap:"):
return stripped.split(":", 1)[1].strip()
except Exception:
pass
return None
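
For quick manual testing, ``discover_urls`` can be driven from a short script;
the flat import below is an assumption, so adjust it to the actual package layout:

```python
# Illustrative usage, not part of sitemap.py.
import asyncio

from sitemap import discover_urls  # assumed import path


async def main() -> None:
    urls = await discover_urls("example.com", max_urls=20)
    for url in urls:
        print(url)


asyncio.run(main())
```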