feat: initial public release
ConsentOS — a privacy-first cookie consent management platform. Self-hosted, source-available alternative to OneTrust, Cookiebot, and CookieYes.

Highlights:

- Full standards coverage: IAB TCF v2.2, GPP v1, Google Consent Mode v2, GPC, and the Shopify Customer Privacy API
- Multi-tenant architecture with role-based access
- Configuration cascade (system → org → group → site → region); a resolution sketch follows below
- Dark-pattern detection in the scanner
- Tamper-evident consent record audit trail

This is the initial public release; prior development history is retained internally. See README.md for the feature list, architecture overview, and quick-start instructions.

Licensed under the Elastic License 2.0 — self-host freely; do not resell as a managed service.
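The cascade resolves each setting at the most specific scope that defines it, falling back level by level to the system default. The lookup below is a hypothetical sketch of that order, not the actual ConsentOS implementation; the function and variable names are placeholders.

# Hypothetical sketch of cascade resolution: the most specific scope wins,
# falling back level by level until a value is found.
from typing import Any

_CASCADE_ORDER = ["region", "site", "group", "org", "system"]  # most specific first


def resolve_setting(key: str, scopes: dict[str, dict[str, Any]]) -> Any | None:
    """Return the value for `key` from the most specific scope that defines it."""
    for level in _CASCADE_ORDER:
        settings = scopes.get(level, {})
        if key in settings:
            return settings[key]
    return None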
apps/scanner/src/sitemap.py (new file, 119 lines added)
@@ -0,0 +1,119 @@
"""Sitemap parser for URL discovery.

Fetches and parses XML sitemaps (including sitemap indexes) to discover
URLs for crawling. Falls back to common page paths if no sitemap exists.
"""

from __future__ import annotations

import logging
from xml.etree import ElementTree

import httpx

logger = logging.getLogger(__name__)

# XML namespace used in sitemaps
_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}

# Common page paths to try when no sitemap is available
_DEFAULT_PATHS = [
    "/",
    "/about",
    "/contact",
    "/privacy",
    "/privacy-policy",
    "/terms",
    "/cookie-policy",
]


async def discover_urls(
    domain: str,
    *,
    max_urls: int = 50,
    timeout: float = 10.0,
) -> list[str]:
    """Discover URLs for a domain via sitemap or fallback paths.

    Attempts to fetch /sitemap.xml first. If that fails, tries
    /robots.txt for a Sitemap directive. Falls back to default paths.
    """
    base = f"https://{domain}"
    urls: list[str] = []

    async with httpx.AsyncClient(
        timeout=timeout,
        follow_redirects=True,
        verify=False,  # noqa: S501 — scanning may target sites with self-signed certs
    ) as client:
        # Try sitemap.xml
        sitemap_urls = await _fetch_sitemap(client, f"{base}/sitemap.xml", max_urls)
        if sitemap_urls:
            return sitemap_urls[:max_urls]

        # Try robots.txt for Sitemap directive
        sitemap_url = await _find_sitemap_in_robots(client, f"{base}/robots.txt")
        if sitemap_url:
            sitemap_urls = await _fetch_sitemap(client, sitemap_url, max_urls)
            if sitemap_urls:
                return sitemap_urls[:max_urls]

        # Fallback to default paths
        urls = [f"{base}{path}" for path in _DEFAULT_PATHS]
        return urls[:max_urls]


async def _fetch_sitemap(
    client: httpx.AsyncClient,
    url: str,
    max_urls: int,
) -> list[str]:
    """Fetch and parse an XML sitemap. Handles sitemap indexes."""
    try:
        resp = await client.get(url)
        if resp.status_code != 200:
            return []

        root = ElementTree.fromstring(resp.text)

        # Check if it's a sitemap index
        sitemaps = root.findall("sm:sitemap/sm:loc", _NS)
        if sitemaps:
            urls: list[str] = []
            for sm_loc in sitemaps:
                if sm_loc.text:
                    child_urls = await _fetch_sitemap(client, sm_loc.text, max_urls - len(urls))
                    urls.extend(child_urls)
                if len(urls) >= max_urls:
                    break
            return urls[:max_urls]

        # Regular sitemap — extract <loc> URLs
        locs = root.findall("sm:url/sm:loc", _NS)
        return [loc.text for loc in locs if loc.text][:max_urls]

    except Exception as exc:
        logger.debug("Failed to fetch sitemap %s: %s", url, exc)
        return []


async def _find_sitemap_in_robots(
    client: httpx.AsyncClient,
    robots_url: str,
) -> str | None:
    """Look for a Sitemap directive in robots.txt."""
    try:
        resp = await client.get(robots_url)
        if resp.status_code != 200:
            return None

        for line in resp.text.splitlines():
            stripped = line.strip()
            if stripped.lower().startswith("sitemap:"):
                return stripped.split(":", 1)[1].strip()

    except Exception:
        pass

    return None
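For reference, discover_urls can be exercised on its own. The driver below is a minimal illustrative sketch: it assumes it is run from apps/scanner/src so that `sitemap` is importable, and example.com is a placeholder domain, not part of the scanner.

# Minimal illustrative driver for discover_urls (assumes it runs from
# apps/scanner/src so `sitemap` is importable; example.com is a placeholder).
import asyncio

from sitemap import discover_urls


async def main() -> None:
    urls = await discover_urls("example.com", max_urls=20)
    for url in urls:
        print(url)


if __name__ == "__main__":
    asyncio.run(main())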