feat: initial public release

ConsentOS — a privacy-first cookie consent management platform.

Self-hosted, source-available alternative to OneTrust, Cookiebot, and
CookieYes. Full standards coverage (IAB TCF v2.2, GPP v1, Google
Consent Mode v2, GPC, Shopify Customer Privacy API), multi-tenant
architecture with role-based access, configuration cascade
(system → org → group → site → region), dark-pattern detection in
the scanner, and a tamper-evident consent record audit trail.

This is the initial public release. Prior development history is
retained internally.

See README.md for the feature list, architecture overview, and
quick-start instructions. Licensed under the Elastic License 2.0 —
self-host freely; do not resell as a managed service.
James Cottrill
2026-04-13 14:20:15 +00:00
commit fbf26453f2
341 changed files with 62807 additions and 0 deletions

@@ -0,0 +1,107 @@
"""Cookie classification based on known patterns.
Matches discovered cookies against a database of known cookie patterns
to auto-categorise them (analytics, marketing, functional, etc.).
"""
from __future__ import annotations
import re
from dataclasses import dataclass
@dataclass
class KnownPattern:
"""A known cookie pattern for classification."""
name_pattern: str
domain_pattern: str
category: str
vendor: str | None = None
is_regex: bool = False
@dataclass
class ClassificationResult:
"""Result of classifying a cookie."""
category: str | None
vendor: str | None = None
match_source: str = "unmatched" # exact | wildcard | regex | unmatched
def classify_cookie(
name: str,
domain: str,
patterns: list[KnownPattern],
) -> ClassificationResult:
"""Classify a cookie by matching against known patterns.
Matching priority:
1. Exact name match
2. Wildcard match (patterns containing *)
3. Regex match (patterns flagged as regex)
"""
    # Pass 1: exact name matches take priority over wildcards.
    for pattern in patterns:
        if pattern.is_regex or "*" in pattern.name_pattern:
            continue
        if pattern.name_pattern == name and _domain_matches(domain, pattern.domain_pattern):
            return ClassificationResult(
                category=pattern.category,
                vendor=pattern.vendor,
                match_source="exact",
            )
    # Pass 2: wildcard patterns, compiled to anchored regexes.
    for pattern in patterns:
        if pattern.is_regex or "*" not in pattern.name_pattern:
            continue
        regex = pattern.name_pattern.replace(".", r"\.").replace("*", ".*")
        if re.match(f"^{regex}$", name, re.IGNORECASE) and _domain_matches(
            domain, pattern.domain_pattern
        ):
            return ClassificationResult(
                category=pattern.category,
                vendor=pattern.vendor,
                match_source="wildcard",
            )
    # Pass 3: regex patterns (invalid expressions are skipped).
    for pattern in patterns:
        if not pattern.is_regex:
            continue
        try:
            if re.match(pattern.name_pattern, name, re.IGNORECASE) and _domain_matches(
                domain, pattern.domain_pattern
            ):
                return ClassificationResult(
                    category=pattern.category,
                    vendor=pattern.vendor,
                    match_source="regex",
                )
        except re.error:
            continue
    return ClassificationResult(category=None, match_source="unmatched")
def _domain_matches(actual: str, pattern: str) -> bool:
"""Check if a domain matches a pattern.
Patterns can be:
- "*" — matches any domain
- ".example.com" — matches example.com and *.example.com
- "example.com" — exact match
"""
if pattern == "*":
return True
actual = actual.lower().lstrip(".")
pattern = pattern.lower().lstrip(".")
if actual == pattern:
return True
# Subdomain match: actual "sub.example.com" matches pattern "example.com"
if actual.endswith(f".{pattern}"):
return True
return False
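
A minimal usage sketch; the patterns shown are illustrative, not entries from a shipped pattern database:

patterns = [
    KnownPattern(name_pattern="_ga", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(name_pattern="_ga_*", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(name_pattern=r"^_hj\w+$", domain_pattern="*", category="analytics", vendor="Hotjar", is_regex=True),
]
result = classify_cookie("_ga_1XYZ", "www.example.com", patterns)
# -> category="analytics", vendor="Google", match_source="wildcard"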

280
apps/scanner/src/consent_validator.py Normal file

@@ -0,0 +1,280 @@
"""Consent signal validation — Playwright-based runtime checks.
Validates that consent signals (GCM, TCF, GPP) work correctly at runtime
by checking pre-consent, post-accept, and post-reject states.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from urllib.parse import urlparse
from playwright.async_api import BrowserContext, Page
logger = logging.getLogger(__name__)
# Known tracker domains for pixel-fire detection
KNOWN_TRACKER_DOMAINS = frozenset(
{
"google-analytics.com",
"googletagmanager.com",
"doubleclick.net",
"facebook.net",
"facebook.com",
"connect.facebook.net",
"analytics.tiktok.com",
"snap.licdn.com",
"bat.bing.com",
"clarity.ms",
"hotjar.com",
"mouseflow.com",
"cdn.segment.com",
"cdn.mxpnl.com",
"plausible.io",
"px.ads.linkedin.com",
}
)
@dataclass
class ConsentSignalState:
"""Captured consent signal state from the page."""
gcm_state: dict | None = None
tcf_data: dict | None = None
gpp_data: dict | None = None
@dataclass
class ValidationIssue:
"""A single consent validation issue."""
check: str
severity: str # critical, warning, info
message: str
recommendation: str
details: dict = field(default_factory=dict)
@dataclass
class ValidationResult:
"""Result of consent signal validation for a page."""
url: str
pre_consent_issues: list[ValidationIssue] = field(default_factory=list)
post_accept_issues: list[ValidationIssue] = field(default_factory=list)
post_reject_issues: list[ValidationIssue] = field(default_factory=list)
error: str | None = None
@property
def all_issues(self) -> list[ValidationIssue]:
return self.pre_consent_issues + self.post_accept_issues + self.post_reject_issues
@property
def has_issues(self) -> bool:
return bool(self.all_issues)
async def _get_consent_signals(page: Page) -> ConsentSignalState:
"""Extract current consent signal state from the page."""
state = ConsentSignalState()
# Read GCM state
try:
gcm = await page.evaluate("""() => {
try {
if (window.dataLayer) {
const consentEvents = window.dataLayer.filter(
e => e[0] === 'consent' || (e.event && e.event.includes('consent'))
);
return { dataLayer: consentEvents, available: true };
}
return { available: false };
} catch (e) { return { error: e.message }; }
}""")
state.gcm_state = gcm
except Exception:
pass
# Read TCF state
try:
tcf = await page.evaluate("""() => {
return new Promise((resolve) => {
if (typeof window.__tcfapi === 'function') {
window.__tcfapi('getTCData', 2, (data, success) => {
resolve({ available: true, success, data: data || null });
});
} else {
resolve({ available: false });
}
});
}""")
state.tcf_data = tcf
except Exception:
pass
# Read GPP state
try:
gpp = await page.evaluate("""() => {
return new Promise((resolve) => {
if (typeof window.__gpp === 'function') {
window.__gpp('getGPPData', (data, success) => {
resolve({ available: true, success, data: data || null });
});
} else {
resolve({ available: false });
}
});
}""")
state.gpp_data = gpp
except Exception:
pass
return state
async def _get_cookies_from_context(context: BrowserContext) -> list[dict]:
"""Get all cookies from the browser context."""
return await context.cookies()
def _is_tracker_request(url: str) -> bool:
    """Check if a URL's hostname belongs to a known tracker domain.
    Matches on the hostname suffix rather than a raw substring, so a path
    or query string that merely contains a tracker domain does not match.
    """
    host = (urlparse(url).hostname or "").lower()
    return any(host == d or host.endswith(f".{d}") for d in KNOWN_TRACKER_DOMAINS)
async def validate_pre_consent(
page: Page,
context: BrowserContext,
essential_cookie_names: set[str],
tracker_requests: list[str],
) -> list[ValidationIssue]:
"""Validate that no non-essential activity occurs before consent."""
issues: list[ValidationIssue] = []
# Check cookies — only essential should be set
cookies = await _get_cookies_from_context(context)
non_essential = [c for c in cookies if c["name"] not in essential_cookie_names]
if non_essential:
names = [c["name"] for c in non_essential]
issues.append(
ValidationIssue(
check="pre_consent_cookies",
severity="critical",
message=(
f"{len(non_essential)} non-essential cookie(s) set before consent: "
f"{', '.join(names[:5])}"
),
recommendation=(
"Ensure all non-essential cookies are blocked until consent is given."
),
details={"cookies": names},
)
)
# Check tracker requests
tracker_hits = [url for url in tracker_requests if _is_tracker_request(url)]
if tracker_hits:
issues.append(
ValidationIssue(
check="pre_consent_trackers",
severity="critical",
message=f"{len(tracker_hits)} tracking request(s) fired before consent.",
recommendation="Block all tracking scripts until the user grants consent.",
details={"tracker_urls": tracker_hits[:10]},
)
)
    # GCM state is captured for reporting only; a strict "defaults denied"
    # check would require parsing gtag consent events, which varies by setup.
    signals = await _get_consent_signals(page)
# Check TCF — no purpose consents should be active
if signals.tcf_data and signals.tcf_data.get("available"):
tcf_data = signals.tcf_data.get("data") or {}
purpose_consents = tcf_data.get("purpose", {}).get("consents", {})
granted_purposes = [k for k, v in purpose_consents.items() if v]
if granted_purposes:
issues.append(
ValidationIssue(
check="pre_consent_tcf",
severity="critical",
message=f"TCF purpose consents active before user action: {granted_purposes}",
recommendation="TCF should report no purpose consents until user grants them.",
details={"granted_purposes": granted_purposes},
)
)
return issues
async def validate_post_accept(
page: Page,
context: BrowserContext,
) -> list[ValidationIssue]:
"""Validate consent signals after Accept All is clicked."""
issues: list[ValidationIssue] = []
signals = await _get_consent_signals(page)
# Check TCF — purposes should now be consented
if signals.tcf_data and signals.tcf_data.get("available"):
if not signals.tcf_data.get("success"):
issues.append(
ValidationIssue(
check="post_accept_tcf",
severity="warning",
message="TCF getTCData returned unsuccessful after Accept All.",
recommendation=("Verify TCF API returns valid TC data after consent."),
)
)
return issues
async def validate_post_reject(
page: Page,
context: BrowserContext,
essential_cookie_names: set[str],
tracker_requests: list[str],
) -> list[ValidationIssue]:
"""Validate that rejection is respected — no tracking after reject."""
issues: list[ValidationIssue] = []
# Check cookies after reject
cookies = await _get_cookies_from_context(context)
non_essential = [c for c in cookies if c["name"] not in essential_cookie_names]
if non_essential:
names = [c["name"] for c in non_essential]
issues.append(
ValidationIssue(
check="post_reject_cookies",
severity="critical",
message=(
f"{len(non_essential)} non-essential cookie(s) remain after rejection: "
f"{', '.join(names[:5])}"
),
recommendation="Ensure all non-essential cookies are removed when user rejects.",
details={"cookies": names},
)
)
# Check tracker requests after reject
tracker_hits = [url for url in tracker_requests if _is_tracker_request(url)]
if tracker_hits:
issues.append(
ValidationIssue(
check="post_reject_trackers",
severity="critical",
message=f"{len(tracker_hits)} tracking request(s) fired after rejection.",
recommendation="Ensure tracking scripts respect rejection and do not fire.",
details={"tracker_urls": tracker_hits[:10]},
)
)
return issues
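
A minimal wiring sketch for a standalone pre-consent check; the URL and essential cookie name are illustrative, and worker.py (below) drives the full pre-consent / post-accept / post-reject flow:

import asyncio
from playwright.async_api import async_playwright

async def check(url: str) -> None:
    tracker_requests: list[str] = []
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        # Record every request URL; validate_pre_consent filters trackers itself.
        page.on("request", lambda req: tracker_requests.append(req.url))
        await page.goto(url, wait_until="networkidle")
        issues = await validate_pre_consent(page, context, {"session_id"}, tracker_requests)
        for issue in issues:
            print(f"[{issue.severity}] {issue.check}: {issue.message}")
        await browser.close()

asyncio.run(check("https://example.com"))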

335
apps/scanner/src/crawler.py Normal file

@@ -0,0 +1,335 @@
"""Playwright-based headless browser cookie crawler.
For each URL: launches headless Chromium, clears cookies, navigates,
waits for network idle, enumerates document.cookie / localStorage /
sessionStorage, captures Set-Cookie headers from network requests,
and attributes cookies to source scripts via the request chain.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from urllib.parse import urlparse
from playwright.async_api import (
    Browser,
    BrowserContext,
    Page,
    Request,
    Response,
    async_playwright,
)
logger = logging.getLogger(__name__)
# Realistic Chrome UA so sites don't block the crawler as a bot.
_DEFAULT_USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/131.0.0.0 Safari/537.36"
)
@dataclass
class DiscoveredCookie:
"""A cookie or storage item found during a crawl."""
name: str
domain: str
storage_type: str = "cookie" # cookie | local_storage | session_storage
path: str | None = None
expires: float | None = None
http_only: bool | None = None
secure: bool | None = None
same_site: str | None = None
value_length: int = 0
script_source: str | None = None
page_url: str = ""
initiator_chain: list[str] = field(default_factory=list)
@dataclass
class CrawlResult:
"""Result of crawling a single page."""
url: str
cookies: list[DiscoveredCookie] = field(default_factory=list)
error: str | None = None
@dataclass
class SiteCrawlResult:
"""Aggregated result of crawling all pages on a site."""
domain: str
pages: list[CrawlResult] = field(default_factory=list)
total_cookies_found: int = 0
@property
def unique_cookies(self) -> list[DiscoveredCookie]:
"""Deduplicate cookies across pages by (name, domain, storage_type)."""
seen: dict[tuple[str, str, str], DiscoveredCookie] = {}
for page in self.pages:
for cookie in page.cookies:
key = (cookie.name, cookie.domain, cookie.storage_type)
if key not in seen:
seen[key] = cookie
return list(seen.values())
@dataclass
class ProxyConfig:
"""Proxy configuration for geo-located scanning."""
server: str # e.g. "http://proxy-eu.example.com:8080"
username: str | None = None
password: str | None = None
class CookieCrawler:
"""Crawls a site using Playwright to discover cookies and storage items."""
def __init__(
self,
*,
headless: bool = True,
timeout_ms: int = 30_000,
user_agent: str = _DEFAULT_USER_AGENT,
proxy: ProxyConfig | None = None,
) -> None:
self._headless = headless
self._timeout_ms = timeout_ms
self._user_agent = user_agent
self._proxy = proxy
async def crawl_site(
self,
urls: list[str],
*,
max_pages: int = 50,
) -> SiteCrawlResult:
"""Crawl multiple URLs and aggregate cookie discoveries."""
if not urls:
return SiteCrawlResult(domain="")
domain = urlparse(urls[0]).hostname or ""
result = SiteCrawlResult(domain=domain)
async with async_playwright() as pw:
launch_kwargs: dict = {"headless": self._headless}
if self._proxy:
proxy_opts: dict = {"server": self._proxy.server}
if self._proxy.username:
proxy_opts["username"] = self._proxy.username
if self._proxy.password:
proxy_opts["password"] = self._proxy.password
launch_kwargs["proxy"] = proxy_opts
browser = await pw.chromium.launch(**launch_kwargs)
try:
for url in urls[:max_pages]:
page_result = await self._crawl_page(browser, url)
result.pages.append(page_result)
result.total_cookies_found += len(page_result.cookies)
finally:
await browser.close()
return result
async def _crawl_page(
self,
        browser: Browser,
url: str,
) -> CrawlResult:
"""Crawl a single page and discover cookies."""
result = CrawlResult(url=url)
script_cookies: dict[str, str] = {} # cookie name → script URL
initiator_map: dict[str, str] = {} # request URL → initiating URL
initiator_chains: dict[str, list[str]] = {} # cookie name → chain
context: BrowserContext | None = None
try:
context = await browser.new_context(
user_agent=self._user_agent,
ignore_https_errors=True,
)
# Clear all cookies before visiting
await context.clear_cookies()
page: Page = await context.new_page()
# Track request initiator chains via frame URL and redirect chains
def _on_request(request: Request) -> None:
try:
req_url = request.url
# Follow redirect chain to find the original initiator
redirected = request.redirected_from
if redirected:
initiator_map[req_url] = redirected.url
else:
# Use the frame URL as the parent initiator
frame_url = request.frame.url if request.frame else ""
if frame_url and frame_url != req_url:
initiator_map[req_url] = frame_url
except Exception:
pass # Non-critical — request introspection may fail
page.on("request", _on_request)
# Track Set-Cookie headers from responses
async def _on_response(response: Response) -> None:
try:
headers = await response.all_headers()
set_cookie = headers.get("set-cookie", "")
if set_cookie:
# Attribute cookie to the initiating script
request: Request = response.request
initiator = _get_script_initiator(request)
# Build the initiator chain for this request
chain = _build_initiator_chain(request.url, initiator_map)
for cookie_str in set_cookie.split("\n"):
name = cookie_str.split("=")[0].strip()
if name:
if initiator:
script_cookies[name] = initiator
initiator_chains[name] = chain
except Exception:
pass # Non-critical — response may have been aborted
page.on("response", _on_response)
# Navigate
await page.goto(url, wait_until="domcontentloaded", timeout=self._timeout_ms)
# Allow additional time for scripts to set cookies after DOM load.
await page.wait_for_timeout(3000)
# Enumerate browser cookies via CDP
cdp_cookies = await context.cookies()
for c in cdp_cookies:
result.cookies.append(
DiscoveredCookie(
name=c["name"],
domain=c["domain"],
storage_type="cookie",
path=c.get("path"),
expires=c.get("expires"),
http_only=c.get("httpOnly"),
secure=c.get("secure"),
same_site=c.get("sameSite"),
value_length=len(c.get("value", "")),
script_source=script_cookies.get(c["name"]),
page_url=url,
initiator_chain=initiator_chains.get(c["name"], []),
)
)
# Enumerate localStorage
ls_items = await page.evaluate("""() => {
const items = [];
try {
for (let i = 0; i < localStorage.length; i++) {
const key = localStorage.key(i);
if (key) {
items.push({
name: key,
valueLength: (localStorage.getItem(key) || '').length,
});
}
}
} catch (e) {}
return items;
}""")
hostname = urlparse(url).hostname or ""
for item in ls_items:
result.cookies.append(
DiscoveredCookie(
name=item["name"],
domain=hostname,
storage_type="local_storage",
value_length=item["valueLength"],
page_url=url,
)
)
# Enumerate sessionStorage
ss_items = await page.evaluate("""() => {
const items = [];
try {
for (let i = 0; i < sessionStorage.length; i++) {
const key = sessionStorage.key(i);
if (key) {
items.push({
name: key,
valueLength: (sessionStorage.getItem(key) || '').length,
});
}
}
} catch (e) {}
return items;
}""")
for item in ss_items:
result.cookies.append(
DiscoveredCookie(
name=item["name"],
domain=hostname,
storage_type="session_storage",
value_length=item["valueLength"],
page_url=url,
)
)
except Exception as exc:
result.error = str(exc)
logger.warning("Failed to crawl %s: %s", url, exc)
finally:
if context:
await context.close()
return result
def _get_script_initiator(request: Request) -> str | None:
"""Walk the request chain to find the originating script URL.
Returns a single script URL for backwards compatibility. For the full
initiator path, use :func:`_build_initiator_chain` instead.
"""
seen: set[str] = set()
current = request
while current:
url = current.url
if url in seen:
break
seen.add(url)
if url.endswith(".js") or "javascript" in (current.resource_type or ""):
return url
redirected = current.redirected_from
if redirected:
current = redirected
else:
break
return None
def _build_initiator_chain(
url: str,
initiator_map: dict[str, str],
max_depth: int = 20,
) -> list[str]:
"""Build the full initiator chain from a URL back to the root.
Walks the initiator map from *url* towards the top-level page,
producing a list ordered root-first (i.e. the page URL at index 0
and the leaf request URL at the end).
"""
chain = [url]
seen: set[str] = {url}
current = url
for _ in range(max_depth):
parent = initiator_map.get(current, "")
if not parent or parent in seen:
break
chain.append(parent)
seen.add(parent)
current = parent
chain.reverse() # Root first
return chain
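
For reference, a minimal driver for the crawler; the URL is illustrative, and in the service the crawler is invoked via the /scan endpoint in worker.py:

import asyncio

async def main() -> None:
    crawler = CookieCrawler(headless=True, timeout_ms=30_000)
    result = await crawler.crawl_site(["https://example.com/"], max_pages=5)
    for c in result.unique_cookies:
        print(c.storage_type, c.name, c.domain, c.script_source or "-")

asyncio.run(main())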

348
apps/scanner/src/dark_pattern_detector.py Normal file

@@ -0,0 +1,348 @@
"""Dark pattern detection — CSS and DOM analysis of consent banners.
Detects manipulative UI patterns in cookie consent banners:
- Unequal button prominence (Accept bigger/brighter than Reject)
- Pre-ticked category checkboxes
- Missing first-layer Reject button (CNIL violation)
- Cookie walls (blocking page content)
- Dismiss-on-scroll (not valid consent under GDPR)
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from playwright.async_api import Page
logger = logging.getLogger(__name__)
@dataclass
class DarkPatternIssue:
"""A detected dark pattern in the consent banner."""
pattern: str
severity: str # critical, warning, info
message: str
recommendation: str
details: dict = field(default_factory=dict)
@dataclass
class DarkPatternResult:
"""Result of dark pattern analysis."""
url: str
issues: list[DarkPatternIssue] = field(default_factory=list)
banner_found: bool = False
error: str | None = None
# Common selectors for consent banner elements
BANNER_SELECTORS = [
"[id*='cookie']",
"[id*='consent']",
"[class*='cookie']",
"[class*='consent']",
"[id*='cmp']",
"[class*='cmp']",
"[role='dialog'][aria-label*='cookie' i]",
"[role='dialog'][aria-label*='consent' i]",
]
ACCEPT_BUTTON_SELECTORS = [
"button:has-text('Accept')",
"button:has-text('Accept All')",
"button:has-text('Allow')",
"button:has-text('Allow All')",
"button:has-text('I Agree')",
"button:has-text('OK')",
"button:has-text('Got it')",
"[data-action='accept']",
"[id*='accept']",
]
REJECT_BUTTON_SELECTORS = [
"button:has-text('Reject')",
"button:has-text('Reject All')",
"button:has-text('Decline')",
"button:has-text('Deny')",
"button:has-text('Refuse')",
"button:has-text('Tout refuser')",
"[data-action='reject']",
"[id*='reject']",
]
async def _find_banner(page: Page) -> bool:
"""Check if a consent banner is visible on the page."""
for selector in BANNER_SELECTORS:
try:
elements = await page.query_selector_all(selector)
for el in elements:
if await el.is_visible():
return True
except Exception:
continue
return False
async def _find_button(page: Page, selectors: list[str]) -> dict | None:
"""Find a visible button matching one of the selectors, return its computed styles."""
for selector in selectors:
try:
elements = await page.query_selector_all(selector)
for el in elements:
if await el.is_visible():
styles = await el.evaluate("""(el) => {
const cs = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
return {
width: rect.width,
height: rect.height,
area: rect.width * rect.height,
backgroundColor: cs.backgroundColor,
color: cs.color,
fontSize: parseFloat(cs.fontSize),
fontWeight: cs.fontWeight,
padding: cs.padding,
text: el.textContent.trim(),
visible: true,
};
}""")
return styles
except Exception:
continue
return None
async def check_button_prominence(page: Page) -> list[DarkPatternIssue]:
"""Compare Accept and Reject button sizes and visual weight."""
issues: list[DarkPatternIssue] = []
accept_btn = await _find_button(page, ACCEPT_BUTTON_SELECTORS)
reject_btn = await _find_button(page, REJECT_BUTTON_SELECTORS)
if not accept_btn:
return issues # No accept button found — nothing to compare
if not reject_btn:
issues.append(
DarkPatternIssue(
pattern="missing_reject_button",
severity="critical",
message="No visible Reject/Decline button found on the first layer.",
recommendation=(
"Add a clearly visible 'Reject All' button on the first layer "
"of the consent banner, as required by GDPR and CNIL."
),
)
)
return issues
# Compare button areas
accept_area = accept_btn.get("area", 0)
reject_area = reject_btn.get("area", 0)
if reject_area > 0 and accept_area > 0:
ratio = accept_area / reject_area
if ratio > 1.5:
issues.append(
DarkPatternIssue(
pattern="unequal_button_size",
severity="warning",
message=(
f"Accept button is {ratio:.1f}x larger than Reject button. "
"Buttons should have equal prominence."
),
recommendation=(
"Make the Accept and Reject buttons the same size and visual weight."
),
details={
"accept_area": accept_area,
"reject_area": reject_area,
"ratio": round(ratio, 2),
},
)
)
# Compare font sizes
accept_font = accept_btn.get("fontSize", 0)
reject_font = reject_btn.get("fontSize", 0)
if reject_font > 0 and accept_font > reject_font * 1.3:
issues.append(
DarkPatternIssue(
pattern="unequal_font_size",
severity="warning",
message=(
f"Accept button font ({accept_font}px) is larger than "
f"Reject button font ({reject_font}px)."
),
recommendation="Use the same font size for both Accept and Reject buttons.",
details={
"accept_font_size": accept_font,
"reject_font_size": reject_font,
},
)
)
return issues
async def check_pre_ticked_boxes(page: Page) -> list[DarkPatternIssue]:
"""Check for pre-ticked non-essential category checkboxes."""
issues: list[DarkPatternIssue] = []
try:
pre_ticked = await page.evaluate("""() => {
const checkboxes = document.querySelectorAll(
'input[type="checkbox"][checked], input[type="checkbox"]:checked'
);
const results = [];
for (const cb of checkboxes) {
// Skip if it looks like an "essential" checkbox (often disabled)
if (cb.disabled) continue;
const label = cb.closest('label')?.textContent?.trim()
|| cb.getAttribute('aria-label')
|| cb.name
|| 'unknown';
// Skip checkboxes that appear to be for essential/necessary
const labelLower = label.toLowerCase();
if (labelLower.includes('essential') || labelLower.includes('necessary')
|| labelLower.includes('required') || labelLower.includes('strictly')) {
continue;
}
results.push({ name: cb.name || cb.id, label: label });
}
return results;
}""")
if pre_ticked:
labels = [pt["label"][:50] for pt in pre_ticked]
issues.append(
DarkPatternIssue(
pattern="pre_ticked_checkboxes",
severity="critical",
message=(
f"{len(pre_ticked)} non-essential category checkbox(es) are pre-ticked: "
f"{', '.join(labels[:3])}"
),
recommendation=(
"Non-essential category checkboxes must default to unchecked. "
"Pre-ticked boxes do not constitute valid consent under GDPR."
),
details={"checkboxes": pre_ticked},
)
)
except Exception as exc:
logger.debug("Pre-ticked checkbox check failed: %s", exc)
return issues
async def check_cookie_wall(page: Page) -> list[DarkPatternIssue]:
"""Check if a cookie wall blocks access to page content."""
issues: list[DarkPatternIssue] = []
try:
is_wall = await page.evaluate("""() => {
// Check for full-screen overlays blocking content
const overlays = document.querySelectorAll(
'[class*="overlay"], [class*="modal"], [class*="wall"]'
);
for (const overlay of overlays) {
const cs = window.getComputedStyle(overlay);
const rect = overlay.getBoundingClientRect();
// Full-viewport overlay with high z-index suggests a cookie wall
if (rect.width >= window.innerWidth * 0.9
&& rect.height >= window.innerHeight * 0.9
&& parseInt(cs.zIndex) > 100) {
return true;
}
}
// Check if body/main is hidden or has overflow hidden
const body = document.body;
const bodyStyle = window.getComputedStyle(body);
if (bodyStyle.overflow === 'hidden' && bodyStyle.position === 'fixed') {
return true;
}
return false;
}""")
if is_wall:
issues.append(
DarkPatternIssue(
pattern="cookie_wall",
severity="critical",
message="Cookie wall detected — page content appears blocked until consent.",
recommendation=(
"Remove the cookie wall. Users must be able to access the site "
"without being forced to consent to non-essential cookies."
),
)
)
except Exception as exc:
logger.debug("Cookie wall check failed: %s", exc)
return issues
async def check_scroll_dismissal(page: Page) -> list[DarkPatternIssue]:
"""Check if scrolling dismisses the consent banner (not valid consent)."""
issues: list[DarkPatternIssue] = []
try:
# Check if banner is visible before scroll
banner_visible_before = await _find_banner(page)
if not banner_visible_before:
return issues
# Scroll down
await page.evaluate("window.scrollBy(0, 500)")
await page.wait_for_timeout(1000)
# Check if banner disappeared
banner_visible_after = await _find_banner(page)
if banner_visible_before and not banner_visible_after:
issues.append(
DarkPatternIssue(
pattern="scroll_dismissal",
severity="critical",
message="Consent banner dismissed on scroll — this is not valid consent.",
recommendation=(
"Disable dismiss-on-scroll. Under GDPR, scrolling does not "
"constitute valid consent. The banner must remain until the user "
"makes an explicit choice."
),
)
)
except Exception as exc:
logger.debug("Scroll dismissal check failed: %s", exc)
return issues
async def detect_dark_patterns(page: Page) -> DarkPatternResult:
"""Run all dark pattern checks on the current page."""
url = page.url
result = DarkPatternResult(url=url)
try:
result.banner_found = await _find_banner(page)
if not result.banner_found:
return result
# Run all checks
result.issues.extend(await check_button_prominence(page))
result.issues.extend(await check_pre_ticked_boxes(page))
result.issues.extend(await check_cookie_wall(page))
result.issues.extend(await check_scroll_dismissal(page))
except Exception as exc:
result.error = str(exc)
logger.warning("Dark pattern detection failed for %s: %s", url, exc)
return result
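
A minimal standalone invocation, assuming Playwright is installed; the URL is illustrative, and the service calls detect_dark_patterns from the /validate endpoint in worker.py:

import asyncio
from playwright.async_api import async_playwright

async def main() -> None:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://example.com", wait_until="networkidle")
        result = await detect_dark_patterns(page)
        print("banner found:", result.banner_found)
        for issue in result.issues:
            print(f"[{issue.severity}] {issue.pattern}: {issue.message}")
        await browser.close()

asyncio.run(main())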

119
apps/scanner/src/sitemap.py Normal file

@@ -0,0 +1,119 @@
"""Sitemap parser for URL discovery.
Fetches and parses XML sitemaps (including sitemap indexes) to discover
URLs for crawling. Falls back to common page paths if no sitemap exists.
"""
from __future__ import annotations
import logging
from xml.etree import ElementTree
import httpx
logger = logging.getLogger(__name__)
# XML namespace used in sitemaps
_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
# Common page paths to try when no sitemap is available
_DEFAULT_PATHS = [
"/",
"/about",
"/contact",
"/privacy",
"/privacy-policy",
"/terms",
"/cookie-policy",
]
async def discover_urls(
domain: str,
*,
max_urls: int = 50,
timeout: float = 10.0,
) -> list[str]:
"""Discover URLs for a domain via sitemap or fallback paths.
Attempts to fetch /sitemap.xml first. If that fails, tries
/robots.txt for a Sitemap directive. Falls back to default paths.
"""
base = f"https://{domain}"
urls: list[str] = []
async with httpx.AsyncClient(
timeout=timeout,
follow_redirects=True,
verify=False, # noqa: S501 — scanning may target sites with self-signed certs
) as client:
# Try sitemap.xml
sitemap_urls = await _fetch_sitemap(client, f"{base}/sitemap.xml", max_urls)
if sitemap_urls:
return sitemap_urls[:max_urls]
# Try robots.txt for Sitemap directive
sitemap_url = await _find_sitemap_in_robots(client, f"{base}/robots.txt")
if sitemap_url:
sitemap_urls = await _fetch_sitemap(client, sitemap_url, max_urls)
if sitemap_urls:
return sitemap_urls[:max_urls]
# Fallback to default paths
urls = [f"{base}{path}" for path in _DEFAULT_PATHS]
return urls[:max_urls]
async def _fetch_sitemap(
client: httpx.AsyncClient,
url: str,
max_urls: int,
) -> list[str]:
"""Fetch and parse an XML sitemap. Handles sitemap indexes."""
try:
resp = await client.get(url)
if resp.status_code != 200:
return []
root = ElementTree.fromstring(resp.text)
# Check if it's a sitemap index
sitemaps = root.findall("sm:sitemap/sm:loc", _NS)
if sitemaps:
urls: list[str] = []
for sm_loc in sitemaps:
if sm_loc.text:
child_urls = await _fetch_sitemap(client, sm_loc.text, max_urls - len(urls))
urls.extend(child_urls)
if len(urls) >= max_urls:
break
return urls[:max_urls]
# Regular sitemap — extract <loc> URLs
locs = root.findall("sm:url/sm:loc", _NS)
return [loc.text for loc in locs if loc.text][:max_urls]
except Exception as exc:
logger.debug("Failed to fetch sitemap %s: %s", url, exc)
return []
async def _find_sitemap_in_robots(
client: httpx.AsyncClient,
robots_url: str,
) -> str | None:
"""Look for a Sitemap directive in robots.txt."""
try:
resp = await client.get(robots_url)
if resp.status_code != 200:
return None
for line in resp.text.splitlines():
stripped = line.strip()
if stripped.lower().startswith("sitemap:"):
return stripped.split(":", 1)[1].strip()
except Exception:
pass
return None
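
Usage sketch (the domain is illustrative):

import asyncio

urls = asyncio.run(discover_urls("example.com", max_urls=10))
# Either sitemap-derived URLs, or the default paths
# ("https://example.com/", "https://example.com/about", ...) as a fallback.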

379
apps/scanner/src/worker.py Normal file

@@ -0,0 +1,379 @@
"""Scanner HTTP service.
Exposes an HTTP endpoint that accepts scan requests, runs the Playwright
cookie crawler, and returns discovered cookies. Called by the API's Celery
worker to execute scan jobs.
"""
from __future__ import annotations
import logging
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
logger = logging.getLogger(__name__)
# ── Settings ─────────────────────────────────────────────────────────
class ScannerSettings(BaseSettings):
"""Scanner service settings from environment."""
model_config = SettingsConfigDict(env_file=".env", case_sensitive=False)
host: str = "0.0.0.0"
port: int = 8001
log_level: str = "INFO"
crawler_timeout_ms: int = 30_000
crawler_headless: bool = True
max_pages_per_scan: int = 50
# ── Request / Response schemas ───────────────────────────────────────
class ProxyRequest(BaseModel):
"""Proxy configuration for geo-located scanning."""
server: str
username: str | None = None
password: str | None = None
class ScanRequest(BaseModel):
"""Incoming scan request from the API worker."""
domain: str
urls: list[str] = Field(default_factory=list)
max_pages: int = 50
proxy: ProxyRequest | None = None
class DiscoveredCookieResponse(BaseModel):
"""A single cookie found during crawling."""
name: str
domain: str
storage_type: str = "cookie"
path: str | None = None
expires: float | None = None
http_only: bool | None = None
secure: bool | None = None
same_site: str | None = None
value_length: int = 0
script_source: str | None = None
page_url: str = ""
initiator_chain: list[str] = Field(default_factory=list)
class ScanResponse(BaseModel):
"""Result of a scan."""
domain: str
pages_crawled: int
total_cookies: int
cookies: list[DiscoveredCookieResponse]
errors: list[str] = Field(default_factory=list)
class ValidationRequest(BaseModel):
"""Request for consent validation and dark pattern detection."""
url: str
essential_cookie_names: list[str] = Field(default_factory=list)
proxy: ProxyRequest | None = None
class ValidationIssueResponse(BaseModel):
"""A single validation issue."""
check: str
severity: str
message: str
recommendation: str
details: dict = Field(default_factory=dict)
class DarkPatternIssueResponse(BaseModel):
"""A detected dark pattern."""
pattern: str
severity: str
message: str
recommendation: str
details: dict = Field(default_factory=dict)
class ValidationResponse(BaseModel):
"""Result of consent validation and dark pattern detection."""
url: str
pre_consent_issues: list[ValidationIssueResponse] = Field(default_factory=list)
post_accept_issues: list[ValidationIssueResponse] = Field(default_factory=list)
post_reject_issues: list[ValidationIssueResponse] = Field(default_factory=list)
dark_pattern_issues: list[DarkPatternIssueResponse] = Field(default_factory=list)
banner_found: bool = False
errors: list[str] = Field(default_factory=list)
# ── Application ──────────────────────────────────────────────────────
def create_app(): # noqa: ANN201
"""Create the scanner FastAPI application."""
from fastapi import FastAPI, HTTPException
from src.crawler import CookieCrawler
from src.sitemap import discover_urls
app = FastAPI(title="CMP Scanner Service", version="0.1.0")
settings = ScannerSettings()
@app.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok"}
@app.post("/scan", response_model=ScanResponse)
async def run_scan(body: ScanRequest) -> ScanResponse:
"""Execute a Playwright crawl and return discovered cookies."""
# Discover URLs if none provided
urls = body.urls
if not urls:
try:
urls = await discover_urls(
body.domain, max_urls=min(body.max_pages, settings.max_pages_per_scan)
)
except Exception as exc:
logger.warning("URL discovery failed for %s: %s", body.domain, exc)
urls = [f"https://{body.domain}/"]
if not urls:
raise HTTPException(status_code=400, detail="No URLs to scan")
# Run crawler
from src.crawler import ProxyConfig
proxy_config = None
if body.proxy:
proxy_config = ProxyConfig(
server=body.proxy.server,
username=body.proxy.username,
password=body.proxy.password,
)
crawler = CookieCrawler(
headless=settings.crawler_headless,
timeout_ms=settings.crawler_timeout_ms,
proxy=proxy_config,
)
result = await crawler.crawl_site(
urls, max_pages=min(body.max_pages, settings.max_pages_per_scan)
)
# Build response
cookies = [
DiscoveredCookieResponse(
name=c.name,
domain=c.domain,
storage_type=c.storage_type,
path=c.path,
expires=c.expires,
http_only=c.http_only,
secure=c.secure,
same_site=c.same_site,
value_length=c.value_length,
script_source=c.script_source,
page_url=c.page_url,
initiator_chain=c.initiator_chain,
)
for c in result.unique_cookies
]
errors = [p.error for p in result.pages if p.error]
return ScanResponse(
domain=result.domain,
pages_crawled=len(result.pages),
total_cookies=result.total_cookies_found,
cookies=cookies,
errors=errors,
)
@app.post("/validate", response_model=ValidationResponse)
async def run_validation(body: ValidationRequest) -> ValidationResponse:
"""Run consent signal validation and dark pattern detection."""
from playwright.async_api import async_playwright
from src.consent_validator import (
_is_tracker_request,
validate_post_accept,
validate_post_reject,
validate_pre_consent,
)
from src.crawler import ProxyConfig
from src.dark_pattern_detector import detect_dark_patterns
response = ValidationResponse(url=body.url)
essential_names = set(body.essential_cookie_names)
tracker_requests: list[str] = []
proxy_config = None
if body.proxy:
proxy_config = ProxyConfig(
server=body.proxy.server,
username=body.proxy.username,
password=body.proxy.password,
)
try:
async with async_playwright() as pw:
launch_kwargs: dict = {"headless": settings.crawler_headless}
if proxy_config:
proxy_opts: dict = {"server": proxy_config.server}
if proxy_config.username:
proxy_opts["username"] = proxy_config.username
if proxy_config.password:
proxy_opts["password"] = proxy_config.password
launch_kwargs["proxy"] = proxy_opts
browser = await pw.chromium.launch(**launch_kwargs)
try:
context = await browser.new_context(ignore_https_errors=True)
page = await context.new_page()
# Track network requests for tracker detection
def _on_request(request) -> None:
if _is_tracker_request(request.url):
tracker_requests.append(request.url)
page.on("request", _on_request)
# ── Pre-consent check ────────────────────────
await page.goto(
body.url,
wait_until="networkidle",
timeout=settings.crawler_timeout_ms,
)
pre_issues = await validate_pre_consent(
page, context, essential_names, tracker_requests
)
response.pre_consent_issues = [
ValidationIssueResponse(**vars(i)) for i in pre_issues
]
# ── Dark pattern detection ───────────────────
dp_result = await detect_dark_patterns(page)
response.banner_found = dp_result.banner_found
response.dark_pattern_issues = [
DarkPatternIssueResponse(**vars(i)) for i in dp_result.issues
]
# ── Post-accept check ────────────────────────
# Try to click Accept All
accept_selectors = [
"button:has-text('Accept All')",
"button:has-text('Accept')",
"button:has-text('Allow All')",
"button:has-text('I Agree')",
"[data-action='accept']",
]
accepted = False
for selector in accept_selectors:
try:
btn = page.locator(selector).first
if await btn.is_visible(timeout=1000):
await btn.click()
await page.wait_for_timeout(2000)
accepted = True
break
except Exception:
continue
if accepted:
tracker_requests.clear()
post_accept = await validate_post_accept(page, context)
response.post_accept_issues = [
ValidationIssueResponse(**vars(i)) for i in post_accept
]
# ── Post-reject check ────────────────────────
# Reload and reject
await context.clear_cookies()
tracker_requests.clear()
await page.goto(
body.url,
wait_until="networkidle",
timeout=settings.crawler_timeout_ms,
)
reject_selectors = [
"button:has-text('Reject All')",
"button:has-text('Reject')",
"button:has-text('Decline')",
"button:has-text('Deny')",
"[data-action='reject']",
]
rejected = False
for selector in reject_selectors:
try:
btn = page.locator(selector).first
if await btn.is_visible(timeout=1000):
await btn.click()
await page.wait_for_timeout(2000)
rejected = True
break
except Exception:
continue
if rejected:
post_reject_trackers: list[str] = []
# Collect any new tracker requests after rejection
for req_url in tracker_requests:
if _is_tracker_request(req_url):
post_reject_trackers.append(req_url)
post_reject = await validate_post_reject(
page, context, essential_names, post_reject_trackers
)
response.post_reject_issues = [
ValidationIssueResponse(**vars(i)) for i in post_reject
]
await context.close()
finally:
await browser.close()
except Exception as exc:
response.errors.append(str(exc))
logger.warning("Validation failed for %s: %s", body.url, exc)
return response
return app
# ── Entrypoint ───────────────────────────────────────────────────────
def main() -> None:
"""Run the scanner service with uvicorn."""
import uvicorn
settings = ScannerSettings()
logging.basicConfig(level=settings.log_level)
uvicorn.run(
"src.worker:create_app",
factory=True,
host=settings.host,
port=settings.port,
workers=1, # Single worker — Playwright manages its own concurrency
access_log=True,
)
if __name__ == "__main__":
main()
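
An example client call against a locally running scanner, assuming the default port; the payload mirrors the ScanRequest schema above:

import httpx

resp = httpx.post(
    "http://localhost:8001/scan",
    json={"domain": "example.com", "max_pages": 5},
    timeout=300.0,
)
resp.raise_for_status()
data = resp.json()
print(data["pages_crawled"], "pages crawled,", data["total_cookies"], "cookies found")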