feat: initial public release

ConsentOS — a privacy-first cookie consent management platform.

Self-hosted, source-available alternative to OneTrust, Cookiebot, and
CookieYes. Full standards coverage (IAB TCF v2.2, GPP v1, Google
Consent Mode v2, GPC, Shopify Customer Privacy API), multi-tenant
architecture with role-based access, configuration cascade
(system → org → group → site → region), dark-pattern detection in
the scanner, and a tamper-evident consent record audit trail.
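
A hypothetical sketch of the cascade semantics (illustration only, not
the ConsentOS configuration API): a key set at a more specific level
wins over the same key inherited from a broader one.

    # Illustration only; not the actual ConsentOS config API.
    def resolve_setting(key: str, layers: list[dict]) -> object | None:
        """Return the value from the most specific layer defining `key`."""
        value = None
        # layers ordered broad -> specific: system, org, group, site, region
        for layer in layers:
            if key in layer:
                value = layer[key]
        return value

    layers = [
        {"banner_theme": "light"},  # system default
        {"banner_theme": "dark"},   # org override
        {},                         # group (no override)
        {},                         # site (no override)
        {"banner_theme": "auto"},   # region override wins
    ]
    assert resolve_setting("banner_theme", layers) == "auto"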

This is the initial public release. Prior development history is
retained internally.

See README.md for the feature list, architecture overview, and
quick-start instructions. Licensed under the Elastic License 2.0 —
self-host freely; do not resell as a managed service.
James Cottrill
2026-04-13 14:20:15 +00:00
commit fbf26453f2
341 changed files with 62807 additions and 0 deletions

apps/scanner/src/sitemap.py Normal file

@@ -0,0 +1,119 @@
"""Sitemap parser for URL discovery.
Fetches and parses XML sitemaps (including sitemap indexes) to discover
URLs for crawling. Falls back to common page paths if no sitemap exists.
"""
from __future__ import annotations
import logging
from xml.etree import ElementTree
import httpx
logger = logging.getLogger(__name__)
# XML namespace used in sitemaps
_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
# Common page paths to try when no sitemap is available
_DEFAULT_PATHS = [
"/",
"/about",
"/contact",
"/privacy",
"/privacy-policy",
"/terms",
"/cookie-policy",
]


async def discover_urls(
    domain: str,
    *,
    max_urls: int = 50,
    timeout: float = 10.0,
) -> list[str]:
    """Discover URLs for a domain via sitemap or fallback paths.

    Attempts to fetch /sitemap.xml first. If that fails, tries
    /robots.txt for a Sitemap directive. Falls back to default paths.
    """
    base = f"https://{domain}"
    urls: list[str] = []
    async with httpx.AsyncClient(
        timeout=timeout,
        follow_redirects=True,
        verify=False,  # noqa: S501 — scanning may target sites with self-signed certs
    ) as client:
        # Try sitemap.xml
        sitemap_urls = await _fetch_sitemap(client, f"{base}/sitemap.xml", max_urls)
        if sitemap_urls:
            return sitemap_urls[:max_urls]
        # Try robots.txt for Sitemap directive
        sitemap_url = await _find_sitemap_in_robots(client, f"{base}/robots.txt")
        if sitemap_url:
            sitemap_urls = await _fetch_sitemap(client, sitemap_url, max_urls)
            if sitemap_urls:
                return sitemap_urls[:max_urls]
    # Fallback to default paths
    urls = [f"{base}{path}" for path in _DEFAULT_PATHS]
    return urls[:max_urls]


async def _fetch_sitemap(
    client: httpx.AsyncClient,
    url: str,
    max_urls: int,
) -> list[str]:
    """Fetch and parse an XML sitemap. Handles sitemap indexes."""
    try:
        resp = await client.get(url)
        if resp.status_code != 200:
            return []
        root = ElementTree.fromstring(resp.text)
        # Check if it's a sitemap index
        sitemaps = root.findall("sm:sitemap/sm:loc", _NS)
        if sitemaps:
            urls: list[str] = []
            for sm_loc in sitemaps:
                if sm_loc.text:
                    child_urls = await _fetch_sitemap(client, sm_loc.text, max_urls - len(urls))
                    urls.extend(child_urls)
                if len(urls) >= max_urls:
                    break
            return urls[:max_urls]
        # Regular sitemap — extract <loc> URLs
        locs = root.findall("sm:url/sm:loc", _NS)
        return [loc.text for loc in locs if loc.text][:max_urls]
    except Exception as exc:
        logger.debug("Failed to fetch sitemap %s: %s", url, exc)
        return []


async def _find_sitemap_in_robots(
    client: httpx.AsyncClient,
    robots_url: str,
) -> str | None:
    """Look for a Sitemap directive in robots.txt."""
    try:
        resp = await client.get(robots_url)
        if resp.status_code != 200:
            return None
        for line in resp.text.splitlines():
            stripped = line.strip()
            if stripped.lower().startswith("sitemap:"):
                return stripped.split(":", 1)[1].strip()
    except Exception:
        pass
    return None
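
A minimal usage sketch for discover_urls, for reference; the asyncio
wrapper and the import path below are assumptions, not part of this
file.

# Illustrative usage only; the module path `sitemap` is an assumption.
import asyncio

from sitemap import discover_urls


async def main() -> None:
    urls = await discover_urls("example.com", max_urls=20)
    for url in urls:
        print(url)


if __name__ == "__main__":
    asyncio.run(main())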