Scanner fixes:

- Remove the conflicting ``path`` from the consent pre-seed cookie (Playwright rejects cookies that specify both ``url`` and ``path``). (See the sketch after this list.)
- Switch to ``networkidle`` + 5s, plus a 2s delayed second pass, for reliable cookie capture.
- Check the sitemap Content-Type to skip SPA HTML fallbacks.
- Propagate ``auto_category`` from scan results to the cookies table during sync (it was previously silently dropped).
- Add ``_gcl_ls`` to the Open Cookie Database CSV.
- Classify ``_consentos_*`` cookies as necessary directly in the classification engine.
- Add ``seed_known_cookies`` to the bootstrap init container command.

Admin UI:

- Add a scan schedule control to the Scans tab — preset options (disabled/daily/weekly/fortnightly/monthly) plus a custom cron input. Saves ``scan_schedule_cron`` on the site config. (See the schedule sketch below.)
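
A minimal sketch of the first two scanner fixes, assuming Playwright's async Python API: the consent cookie is pre-seeded via ``url`` only (no ``path``), and cookies are read once after ``networkidle`` + 5s and again after a further 2s. The cookie name ``cookie_consent``, the helper name, and the merge logic are illustrative placeholders, not the scanner's actual code::

    from playwright.async_api import async_playwright

    async def capture_cookies(url: str) -> list[dict]:
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            context = await browser.new_context()
            # Pre-seed the consent cookie with ``url`` only; adding ``path``
            # alongside ``url`` makes Playwright reject the cookie.
            await context.add_cookies([
                {"name": "cookie_consent", "value": "accepted", "url": url},
            ])
            page = await context.new_page()
            # Wait for network idle, then 5s, then take a delayed 2s second
            # pass so late-firing tags are still captured.
            await page.goto(url, wait_until="networkidle")
            await page.wait_for_timeout(5000)
            first_pass = await context.cookies()
            await page.wait_for_timeout(2000)
            second_pass = await context.cookies()
            await browser.close()
            # Merge both passes, de-duplicating on (name, domain, path).
            merged = {(c["name"], c["domain"], c["path"]): c
                      for c in first_pass + second_pass}
            return list(merged.values())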
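For the Admin UI change, a sketch of how the schedule presets could map to ``scan_schedule_cron`` values. The specific cron strings (including the 1st-and-15th approximation for fortnightly), the ``croniter`` validation, and the function name are assumptions rather than the shipped implementation::

    from __future__ import annotations

    from croniter import croniter

    # Preset -> cron expression; ``None`` means scheduled scanning is disabled.
    # Plain cron has no true 14-day interval, so "fortnightly" is approximated
    # here as the 1st and 15th of each month.
    SCHEDULE_PRESETS: dict[str, str | None] = {
        "disabled": None,
        "daily": "0 3 * * *",
        "weekly": "0 3 * * 1",
        "fortnightly": "0 3 1,15 * *",
        "monthly": "0 3 1 * *",
    }

    def resolve_schedule(preset: str, custom_cron: str | None = None) -> str | None:
        """Return the cron string to store as ``scan_schedule_cron``."""
        if custom_cron:
            if not croniter.is_valid(custom_cron):
                raise ValueError(f"Invalid cron expression: {custom_cron}")
            return custom_cron
        return SCHEDULE_PRESETS[preset]
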
"""Sitemap parser for URL discovery.
|
|
|
|
Fetches and parses XML sitemaps (including sitemap indexes) to discover
|
|
URLs for crawling. Falls back to common page paths if no sitemap exists.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from xml.etree import ElementTree
|
|
|
|
import httpx
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# XML namespace used in sitemaps
|
|
_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
|
|
|
|
# Common page paths to try when no sitemap is available
|
|
_DEFAULT_PATHS = [
|
|
"/",
|
|
"/about",
|
|
"/contact",
|
|
"/privacy",
|
|
"/privacy-policy",
|
|
"/terms",
|
|
"/cookie-policy",
|
|
]
|
|
|
|
|
|
async def discover_urls(
|
|
domain: str,
|
|
*,
|
|
max_urls: int = 50,
|
|
timeout: float = 10.0,
|
|
) -> list[str]:
|
|
"""Discover URLs for a domain via sitemap or fallback paths.
|
|
|
|
Attempts to fetch /sitemap.xml first. If that fails, tries
|
|
/robots.txt for a Sitemap directive. Falls back to default paths.
|
|
"""
|
|
base = f"https://{domain}"
|
|
urls: list[str] = []
|
|
|
|
async with httpx.AsyncClient(
|
|
timeout=timeout,
|
|
follow_redirects=True,
|
|
verify=False, # noqa: S501 — scanning may target sites with self-signed certs
|
|
) as client:
|
|
# Try sitemap.xml
|
|
sitemap_urls = await _fetch_sitemap(client, f"{base}/sitemap.xml", max_urls)
|
|
if sitemap_urls:
|
|
return sitemap_urls[:max_urls]
|
|
|
|
# Try robots.txt for Sitemap directive
|
|
sitemap_url = await _find_sitemap_in_robots(client, f"{base}/robots.txt")
|
|
if sitemap_url:
|
|
sitemap_urls = await _fetch_sitemap(client, sitemap_url, max_urls)
|
|
if sitemap_urls:
|
|
return sitemap_urls[:max_urls]
|
|
|
|
# Fallback to default paths
|
|
urls = [f"{base}{path}" for path in _DEFAULT_PATHS]
|
|
return urls[:max_urls]
|
|
|
|
|
|
async def _fetch_sitemap(
|
|
client: httpx.AsyncClient,
|
|
url: str,
|
|
max_urls: int,
|
|
) -> list[str]:
|
|
"""Fetch and parse an XML sitemap. Handles sitemap indexes."""
|
|
try:
|
|
resp = await client.get(url)
|
|
if resp.status_code != 200:
|
|
return []
|
|
|
|
# SPAs with catch-all nginx/Caddy rules return 200 + text/html
|
|
# for /sitemap.xml. Don't try to parse HTML as XML.
|
|
content_type = resp.headers.get("content-type", "")
|
|
if "html" in content_type and "xml" not in content_type:
|
|
logger.debug("Sitemap %s returned HTML, skipping", url)
|
|
return []
|
|
|
|
root = ElementTree.fromstring(resp.text)
|
|
|
|
# Check if it's a sitemap index
|
|
sitemaps = root.findall("sm:sitemap/sm:loc", _NS)
|
|
if sitemaps:
|
|
urls: list[str] = []
|
|
for sm_loc in sitemaps:
|
|
if sm_loc.text:
|
|
child_urls = await _fetch_sitemap(client, sm_loc.text, max_urls - len(urls))
|
|
urls.extend(child_urls)
|
|
if len(urls) >= max_urls:
|
|
break
|
|
return urls[:max_urls]
|
|
|
|
# Regular sitemap — extract <loc> URLs
|
|
locs = root.findall("sm:url/sm:loc", _NS)
|
|
return [loc.text for loc in locs if loc.text][:max_urls]
|
|
|
|
except Exception as exc:
|
|
logger.debug("Failed to fetch sitemap %s: %s", url, exc)
|
|
return []
|
|
|
|
|
|
async def _find_sitemap_in_robots(
|
|
client: httpx.AsyncClient,
|
|
robots_url: str,
|
|
) -> str | None:
|
|
"""Look for a Sitemap directive in robots.txt."""
|
|
try:
|
|
resp = await client.get(robots_url)
|
|
if resp.status_code != 200:
|
|
return None
|
|
|
|
for line in resp.text.splitlines():
|
|
stripped = line.strip()
|
|
if stripped.lower().startswith("sitemap:"):
|
|
return stripped.split(":", 1)[1].strip()
|
|
|
|
except Exception:
|
|
pass
|
|
|
|
return None
|
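
A quick usage sketch for ``discover_urls``, the module's public entry point; the import path ``scanner.sitemap`` is a guess and will differ depending on where this file lives in the repo::

    import asyncio

    from scanner.sitemap import discover_urls

    # Discover up to 10 URLs for a domain, preferring its sitemap and
    # falling back to the default path list if none is found.
    urls = asyncio.run(discover_urls("example.com", max_urls=10))
    for url in urls:
        print(url)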