feat: initial public release

ConsentOS — a privacy-first cookie consent management platform.

Self-hosted, source-available alternative to OneTrust, Cookiebot, and
CookieYes. Full standards coverage (IAB TCF v2.2, GPP v1, Google
Consent Mode v2, GPC, Shopify Customer Privacy API), multi-tenant
architecture with role-based access, configuration cascade
(system → org → group → site → region), dark-pattern detection in
the scanner, and a tamper-evident consent record audit trail.

This is the initial public release. Prior development history is
retained internally.

See README.md for the feature list, architecture overview, and
quick-start instructions. Licensed under the Elastic License 2.0 —
self-host freely; do not resell as a managed service.
James Cottrill
2026-04-13 14:20:15 +00:00
commit fbf26453f2
341 changed files with 62807 additions and 0 deletions

@@ -0,0 +1,107 @@
"""Cookie classification based on known patterns.
Matches discovered cookies against a database of known cookie patterns
to auto-categorise them (analytics, marketing, functional, etc.).
"""
from __future__ import annotations
import re
from dataclasses import dataclass
@dataclass
class KnownPattern:
"""A known cookie pattern for classification."""
name_pattern: str
domain_pattern: str
category: str
vendor: str | None = None
is_regex: bool = False
@dataclass
class ClassificationResult:
"""Result of classifying a cookie."""
category: str | None
vendor: str | None = None
match_source: str = "unmatched" # exact | wildcard | regex | unmatched
def classify_cookie(
name: str,
domain: str,
patterns: list[KnownPattern],
) -> ClassificationResult:
"""Classify a cookie by matching against known patterns.
Matching priority:
1. Exact name match
2. Wildcard match (patterns containing *)
3. Regex match (patterns flagged as regex)
"""
    # Pass 1: exact name matches take priority over wildcards.
    for pattern in patterns:
        if pattern.is_regex or "*" in pattern.name_pattern:
            continue
        if pattern.name_pattern == name and _domain_matches(domain, pattern.domain_pattern):
            return ClassificationResult(
                category=pattern.category,
                vendor=pattern.vendor,
                match_source="exact",
            )
    # Pass 2: wildcard patterns, compiled to anchored regexes.
    for pattern in patterns:
        if pattern.is_regex or "*" not in pattern.name_pattern:
            continue
        regex = pattern.name_pattern.replace(".", r"\.").replace("*", ".*")
        if re.match(f"^{regex}$", name, re.IGNORECASE) and _domain_matches(
            domain, pattern.domain_pattern
        ):
            return ClassificationResult(
                category=pattern.category,
                vendor=pattern.vendor,
                match_source="wildcard",
            )
    # Pass 3: regex patterns (invalid expressions are skipped).
    for pattern in patterns:
        if not pattern.is_regex:
            continue
        try:
            if re.match(pattern.name_pattern, name, re.IGNORECASE) and _domain_matches(
                domain, pattern.domain_pattern
            ):
                return ClassificationResult(
                    category=pattern.category,
                    vendor=pattern.vendor,
                    match_source="regex",
                )
        except re.error:
            continue
    return ClassificationResult(category=None, match_source="unmatched")
def _domain_matches(actual: str, pattern: str) -> bool:
"""Check if a domain matches a pattern.
Patterns can be:
- "*" — matches any domain
- ".example.com" — matches example.com and *.example.com
- "example.com" — exact match
"""
if pattern == "*":
return True
actual = actual.lower().lstrip(".")
pattern = pattern.lower().lstrip(".")
if actual == pattern:
return True
# Subdomain match: actual "sub.example.com" matches pattern "example.com"
if actual.endswith(f".{pattern}"):
return True
return False
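
A minimal usage sketch; the patterns shown are illustrative, not entries from a shipped pattern database:

patterns = [
    KnownPattern(name_pattern="_ga", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(name_pattern="_ga_*", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(name_pattern=r"^_hj\w+$", domain_pattern="*", category="analytics", vendor="Hotjar", is_regex=True),
]
result = classify_cookie("_ga_1XYZ", "www.example.com", patterns)
# -> category="analytics", vendor="Google", match_source="wildcard"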

280
apps/scanner/src/consent_validator.py Normal file

@@ -0,0 +1,280 @@
"""Consent signal validation — Playwright-based runtime checks.
Validates that consent signals (GCM, TCF, GPP) work correctly at runtime
by checking pre-consent, post-accept, and post-reject states.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from urllib.parse import urlparse
from playwright.async_api import BrowserContext, Page
logger = logging.getLogger(__name__)
# Known tracker domains for pixel-fire detection
KNOWN_TRACKER_DOMAINS = frozenset(
{
"google-analytics.com",
"googletagmanager.com",
"doubleclick.net",
"facebook.net",
"facebook.com",
"connect.facebook.net",
"analytics.tiktok.com",
"snap.licdn.com",
"bat.bing.com",
"clarity.ms",
"hotjar.com",
"mouseflow.com",
"cdn.segment.com",
"cdn.mxpnl.com",
"plausible.io",
"px.ads.linkedin.com",
}
)
@dataclass
class ConsentSignalState:
"""Captured consent signal state from the page."""
gcm_state: dict | None = None
tcf_data: dict | None = None
gpp_data: dict | None = None
@dataclass
class ValidationIssue:
"""A single consent validation issue."""
check: str
severity: str # critical, warning, info
message: str
recommendation: str
details: dict = field(default_factory=dict)
@dataclass
class ValidationResult:
"""Result of consent signal validation for a page."""
url: str
pre_consent_issues: list[ValidationIssue] = field(default_factory=list)
post_accept_issues: list[ValidationIssue] = field(default_factory=list)
post_reject_issues: list[ValidationIssue] = field(default_factory=list)
error: str | None = None
@property
def all_issues(self) -> list[ValidationIssue]:
return self.pre_consent_issues + self.post_accept_issues + self.post_reject_issues
@property
def has_issues(self) -> bool:
return bool(self.all_issues)
async def _get_consent_signals(page: Page) -> ConsentSignalState:
"""Extract current consent signal state from the page."""
state = ConsentSignalState()
# Read GCM state
try:
gcm = await page.evaluate("""() => {
try {
if (window.dataLayer) {
const consentEvents = window.dataLayer.filter(
e => e[0] === 'consent' || (e.event && e.event.includes('consent'))
);
return { dataLayer: consentEvents, available: true };
}
return { available: false };
} catch (e) { return { error: e.message }; }
}""")
state.gcm_state = gcm
except Exception:
pass
# Read TCF state
try:
tcf = await page.evaluate("""() => {
return new Promise((resolve) => {
if (typeof window.__tcfapi === 'function') {
window.__tcfapi('getTCData', 2, (data, success) => {
resolve({ available: true, success, data: data || null });
});
} else {
resolve({ available: false });
}
});
}""")
state.tcf_data = tcf
except Exception:
pass
# Read GPP state
try:
gpp = await page.evaluate("""() => {
return new Promise((resolve) => {
if (typeof window.__gpp === 'function') {
window.__gpp('getGPPData', (data, success) => {
resolve({ available: true, success, data: data || null });
});
} else {
resolve({ available: false });
}
});
}""")
state.gpp_data = gpp
except Exception:
pass
return state
async def _get_cookies_from_context(context: BrowserContext) -> list[dict]:
"""Get all cookies from the browser context."""
return await context.cookies()
def _is_tracker_request(url: str) -> bool:
    """Check if a URL's hostname belongs to a known tracker domain.
    Matches on the hostname suffix rather than a raw substring, so a path
    or query string that merely contains a tracker domain does not match.
    """
    host = (urlparse(url).hostname or "").lower()
    return any(host == d or host.endswith(f".{d}") for d in KNOWN_TRACKER_DOMAINS)
async def validate_pre_consent(
page: Page,
context: BrowserContext,
essential_cookie_names: set[str],
tracker_requests: list[str],
) -> list[ValidationIssue]:
"""Validate that no non-essential activity occurs before consent."""
issues: list[ValidationIssue] = []
# Check cookies — only essential should be set
cookies = await _get_cookies_from_context(context)
non_essential = [c for c in cookies if c["name"] not in essential_cookie_names]
if non_essential:
names = [c["name"] for c in non_essential]
issues.append(
ValidationIssue(
check="pre_consent_cookies",
severity="critical",
message=(
f"{len(non_essential)} non-essential cookie(s) set before consent: "
f"{', '.join(names[:5])}"
),
recommendation=(
"Ensure all non-essential cookies are blocked until consent is given."
),
details={"cookies": names},
)
)
# Check tracker requests
tracker_hits = [url for url in tracker_requests if _is_tracker_request(url)]
if tracker_hits:
issues.append(
ValidationIssue(
check="pre_consent_trackers",
severity="critical",
message=f"{len(tracker_hits)} tracking request(s) fired before consent.",
recommendation="Block all tracking scripts until the user grants consent.",
details={"tracker_urls": tracker_hits[:10]},
)
)
    # GCM state is captured for reporting only; a strict "defaults denied"
    # check would require parsing gtag consent events, which varies by setup.
    signals = await _get_consent_signals(page)
# Check TCF — no purpose consents should be active
if signals.tcf_data and signals.tcf_data.get("available"):
tcf_data = signals.tcf_data.get("data") or {}
purpose_consents = tcf_data.get("purpose", {}).get("consents", {})
granted_purposes = [k for k, v in purpose_consents.items() if v]
if granted_purposes:
issues.append(
ValidationIssue(
check="pre_consent_tcf",
severity="critical",
message=f"TCF purpose consents active before user action: {granted_purposes}",
recommendation="TCF should report no purpose consents until user grants them.",
details={"granted_purposes": granted_purposes},
)
)
return issues
async def validate_post_accept(
page: Page,
context: BrowserContext,
) -> list[ValidationIssue]:
"""Validate consent signals after Accept All is clicked."""
issues: list[ValidationIssue] = []
signals = await _get_consent_signals(page)
# Check TCF — purposes should now be consented
if signals.tcf_data and signals.tcf_data.get("available"):
if not signals.tcf_data.get("success"):
issues.append(
ValidationIssue(
check="post_accept_tcf",
severity="warning",
message="TCF getTCData returned unsuccessful after Accept All.",
recommendation=("Verify TCF API returns valid TC data after consent."),
)
)
return issues
async def validate_post_reject(
page: Page,
context: BrowserContext,
essential_cookie_names: set[str],
tracker_requests: list[str],
) -> list[ValidationIssue]:
"""Validate that rejection is respected — no tracking after reject."""
issues: list[ValidationIssue] = []
# Check cookies after reject
cookies = await _get_cookies_from_context(context)
non_essential = [c for c in cookies if c["name"] not in essential_cookie_names]
if non_essential:
names = [c["name"] for c in non_essential]
issues.append(
ValidationIssue(
check="post_reject_cookies",
severity="critical",
message=(
f"{len(non_essential)} non-essential cookie(s) remain after rejection: "
f"{', '.join(names[:5])}"
),
recommendation="Ensure all non-essential cookies are removed when user rejects.",
details={"cookies": names},
)
)
# Check tracker requests after reject
tracker_hits = [url for url in tracker_requests if _is_tracker_request(url)]
if tracker_hits:
issues.append(
ValidationIssue(
check="post_reject_trackers",
severity="critical",
message=f"{len(tracker_hits)} tracking request(s) fired after rejection.",
recommendation="Ensure tracking scripts respect rejection and do not fire.",
details={"tracker_urls": tracker_hits[:10]},
)
)
return issues
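
A minimal wiring sketch for a standalone pre-consent check; the URL and essential cookie name are illustrative, and worker.py (below) drives the full pre-consent / post-accept / post-reject flow:

import asyncio
from playwright.async_api import async_playwright

async def check(url: str) -> None:
    tracker_requests: list[str] = []
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        # Record every request URL; validate_pre_consent filters trackers itself.
        page.on("request", lambda req: tracker_requests.append(req.url))
        await page.goto(url, wait_until="networkidle")
        issues = await validate_pre_consent(page, context, {"session_id"}, tracker_requests)
        for issue in issues:
            print(f"[{issue.severity}] {issue.check}: {issue.message}")
        await browser.close()

asyncio.run(check("https://example.com"))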

335
apps/scanner/src/crawler.py Normal file

@@ -0,0 +1,335 @@
"""Playwright-based headless browser cookie crawler.
For each URL: launches headless Chromium, clears cookies, navigates,
waits for network idle, enumerates document.cookie / localStorage /
sessionStorage, captures Set-Cookie headers from network requests,
and attributes cookies to source scripts via the request chain.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from urllib.parse import urlparse
from playwright.async_api import (
    Browser,
    BrowserContext,
    Page,
    Request,
    Response,
    async_playwright,
)
logger = logging.getLogger(__name__)
# Realistic Chrome UA so sites don't block the crawler as a bot.
_DEFAULT_USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/131.0.0.0 Safari/537.36"
)
@dataclass
class DiscoveredCookie:
"""A cookie or storage item found during a crawl."""
name: str
domain: str
storage_type: str = "cookie" # cookie | local_storage | session_storage
path: str | None = None
expires: float | None = None
http_only: bool | None = None
secure: bool | None = None
same_site: str | None = None
value_length: int = 0
script_source: str | None = None
page_url: str = ""
initiator_chain: list[str] = field(default_factory=list)
@dataclass
class CrawlResult:
"""Result of crawling a single page."""
url: str
cookies: list[DiscoveredCookie] = field(default_factory=list)
error: str | None = None
@dataclass
class SiteCrawlResult:
"""Aggregated result of crawling all pages on a site."""
domain: str
pages: list[CrawlResult] = field(default_factory=list)
total_cookies_found: int = 0
@property
def unique_cookies(self) -> list[DiscoveredCookie]:
"""Deduplicate cookies across pages by (name, domain, storage_type)."""
seen: dict[tuple[str, str, str], DiscoveredCookie] = {}
for page in self.pages:
for cookie in page.cookies:
key = (cookie.name, cookie.domain, cookie.storage_type)
if key not in seen:
seen[key] = cookie
return list(seen.values())
@dataclass
class ProxyConfig:
"""Proxy configuration for geo-located scanning."""
server: str # e.g. "http://proxy-eu.example.com:8080"
username: str | None = None
password: str | None = None
class CookieCrawler:
"""Crawls a site using Playwright to discover cookies and storage items."""
def __init__(
self,
*,
headless: bool = True,
timeout_ms: int = 30_000,
user_agent: str = _DEFAULT_USER_AGENT,
proxy: ProxyConfig | None = None,
) -> None:
self._headless = headless
self._timeout_ms = timeout_ms
self._user_agent = user_agent
self._proxy = proxy
async def crawl_site(
self,
urls: list[str],
*,
max_pages: int = 50,
) -> SiteCrawlResult:
"""Crawl multiple URLs and aggregate cookie discoveries."""
if not urls:
return SiteCrawlResult(domain="")
domain = urlparse(urls[0]).hostname or ""
result = SiteCrawlResult(domain=domain)
async with async_playwright() as pw:
launch_kwargs: dict = {"headless": self._headless}
if self._proxy:
proxy_opts: dict = {"server": self._proxy.server}
if self._proxy.username:
proxy_opts["username"] = self._proxy.username
if self._proxy.password:
proxy_opts["password"] = self._proxy.password
launch_kwargs["proxy"] = proxy_opts
browser = await pw.chromium.launch(**launch_kwargs)
try:
for url in urls[:max_pages]:
page_result = await self._crawl_page(browser, url)
result.pages.append(page_result)
result.total_cookies_found += len(page_result.cookies)
finally:
await browser.close()
return result
async def _crawl_page(
self,
        browser: Browser,
url: str,
) -> CrawlResult:
"""Crawl a single page and discover cookies."""
result = CrawlResult(url=url)
script_cookies: dict[str, str] = {} # cookie name → script URL
initiator_map: dict[str, str] = {} # request URL → initiating URL
initiator_chains: dict[str, list[str]] = {} # cookie name → chain
context: BrowserContext | None = None
try:
context = await browser.new_context(
user_agent=self._user_agent,
ignore_https_errors=True,
)
# Clear all cookies before visiting
await context.clear_cookies()
page: Page = await context.new_page()
# Track request initiator chains via frame URL and redirect chains
def _on_request(request: Request) -> None:
try:
req_url = request.url
# Follow redirect chain to find the original initiator
redirected = request.redirected_from
if redirected:
initiator_map[req_url] = redirected.url
else:
# Use the frame URL as the parent initiator
frame_url = request.frame.url if request.frame else ""
if frame_url and frame_url != req_url:
initiator_map[req_url] = frame_url
except Exception:
pass # Non-critical — request introspection may fail
page.on("request", _on_request)
# Track Set-Cookie headers from responses
async def _on_response(response: Response) -> None:
try:
headers = await response.all_headers()
set_cookie = headers.get("set-cookie", "")
if set_cookie:
# Attribute cookie to the initiating script
request: Request = response.request
initiator = _get_script_initiator(request)
# Build the initiator chain for this request
chain = _build_initiator_chain(request.url, initiator_map)
for cookie_str in set_cookie.split("\n"):
name = cookie_str.split("=")[0].strip()
if name:
if initiator:
script_cookies[name] = initiator
initiator_chains[name] = chain
except Exception:
pass # Non-critical — response may have been aborted
page.on("response", _on_response)
# Navigate
await page.goto(url, wait_until="domcontentloaded", timeout=self._timeout_ms)
# Allow additional time for scripts to set cookies after DOM load.
await page.wait_for_timeout(3000)
# Enumerate browser cookies via CDP
cdp_cookies = await context.cookies()
for c in cdp_cookies:
result.cookies.append(
DiscoveredCookie(
name=c["name"],
domain=c["domain"],
storage_type="cookie",
path=c.get("path"),
expires=c.get("expires"),
http_only=c.get("httpOnly"),
secure=c.get("secure"),
same_site=c.get("sameSite"),
value_length=len(c.get("value", "")),
script_source=script_cookies.get(c["name"]),
page_url=url,
initiator_chain=initiator_chains.get(c["name"], []),
)
)
# Enumerate localStorage
ls_items = await page.evaluate("""() => {
const items = [];
try {
for (let i = 0; i < localStorage.length; i++) {
const key = localStorage.key(i);
if (key) {
items.push({
name: key,
valueLength: (localStorage.getItem(key) || '').length,
});
}
}
} catch (e) {}
return items;
}""")
hostname = urlparse(url).hostname or ""
for item in ls_items:
result.cookies.append(
DiscoveredCookie(
name=item["name"],
domain=hostname,
storage_type="local_storage",
value_length=item["valueLength"],
page_url=url,
)
)
# Enumerate sessionStorage
ss_items = await page.evaluate("""() => {
const items = [];
try {
for (let i = 0; i < sessionStorage.length; i++) {
const key = sessionStorage.key(i);
if (key) {
items.push({
name: key,
valueLength: (sessionStorage.getItem(key) || '').length,
});
}
}
} catch (e) {}
return items;
}""")
for item in ss_items:
result.cookies.append(
DiscoveredCookie(
name=item["name"],
domain=hostname,
storage_type="session_storage",
value_length=item["valueLength"],
page_url=url,
)
)
except Exception as exc:
result.error = str(exc)
logger.warning("Failed to crawl %s: %s", url, exc)
finally:
if context:
await context.close()
return result
def _get_script_initiator(request: Request) -> str | None:
"""Walk the request chain to find the originating script URL.
Returns a single script URL for backwards compatibility. For the full
initiator path, use :func:`_build_initiator_chain` instead.
"""
seen: set[str] = set()
current = request
while current:
url = current.url
if url in seen:
break
seen.add(url)
if url.endswith(".js") or "javascript" in (current.resource_type or ""):
return url
redirected = current.redirected_from
if redirected:
current = redirected
else:
break
return None
def _build_initiator_chain(
url: str,
initiator_map: dict[str, str],
max_depth: int = 20,
) -> list[str]:
"""Build the full initiator chain from a URL back to the root.
Walks the initiator map from *url* towards the top-level page,
producing a list ordered root-first (i.e. the page URL at index 0
and the leaf request URL at the end).
"""
chain = [url]
seen: set[str] = {url}
current = url
for _ in range(max_depth):
parent = initiator_map.get(current, "")
if not parent or parent in seen:
break
chain.append(parent)
seen.add(parent)
current = parent
chain.reverse() # Root first
return chain
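
For reference, a minimal driver for the crawler; the URL is illustrative, and in the service the crawler is invoked via the /scan endpoint in worker.py:

import asyncio

async def main() -> None:
    crawler = CookieCrawler(headless=True, timeout_ms=30_000)
    result = await crawler.crawl_site(["https://example.com/"], max_pages=5)
    for c in result.unique_cookies:
        print(c.storage_type, c.name, c.domain, c.script_source or "-")

asyncio.run(main())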

348
apps/scanner/src/dark_pattern_detector.py Normal file

@@ -0,0 +1,348 @@
"""Dark pattern detection — CSS and DOM analysis of consent banners.
Detects manipulative UI patterns in cookie consent banners:
- Unequal button prominence (Accept bigger/brighter than Reject)
- Pre-ticked category checkboxes
- Missing first-layer Reject button (CNIL violation)
- Cookie walls (blocking page content)
- Dismiss-on-scroll (not valid consent under GDPR)
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from playwright.async_api import Page
logger = logging.getLogger(__name__)
@dataclass
class DarkPatternIssue:
"""A detected dark pattern in the consent banner."""
pattern: str
severity: str # critical, warning, info
message: str
recommendation: str
details: dict = field(default_factory=dict)
@dataclass
class DarkPatternResult:
"""Result of dark pattern analysis."""
url: str
issues: list[DarkPatternIssue] = field(default_factory=list)
banner_found: bool = False
error: str | None = None
# Common selectors for consent banner elements
BANNER_SELECTORS = [
"[id*='cookie']",
"[id*='consent']",
"[class*='cookie']",
"[class*='consent']",
"[id*='cmp']",
"[class*='cmp']",
"[role='dialog'][aria-label*='cookie' i]",
"[role='dialog'][aria-label*='consent' i]",
]
ACCEPT_BUTTON_SELECTORS = [
"button:has-text('Accept')",
"button:has-text('Accept All')",
"button:has-text('Allow')",
"button:has-text('Allow All')",
"button:has-text('I Agree')",
"button:has-text('OK')",
"button:has-text('Got it')",
"[data-action='accept']",
"[id*='accept']",
]
REJECT_BUTTON_SELECTORS = [
"button:has-text('Reject')",
"button:has-text('Reject All')",
"button:has-text('Decline')",
"button:has-text('Deny')",
"button:has-text('Refuse')",
"button:has-text('Tout refuser')",
"[data-action='reject']",
"[id*='reject']",
]
async def _find_banner(page: Page) -> bool:
"""Check if a consent banner is visible on the page."""
for selector in BANNER_SELECTORS:
try:
elements = await page.query_selector_all(selector)
for el in elements:
if await el.is_visible():
return True
except Exception:
continue
return False
async def _find_button(page: Page, selectors: list[str]) -> dict | None:
"""Find a visible button matching one of the selectors, return its computed styles."""
for selector in selectors:
try:
elements = await page.query_selector_all(selector)
for el in elements:
if await el.is_visible():
styles = await el.evaluate("""(el) => {
const cs = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
return {
width: rect.width,
height: rect.height,
area: rect.width * rect.height,
backgroundColor: cs.backgroundColor,
color: cs.color,
fontSize: parseFloat(cs.fontSize),
fontWeight: cs.fontWeight,
padding: cs.padding,
text: el.textContent.trim(),
visible: true,
};
}""")
return styles
except Exception:
continue
return None
async def check_button_prominence(page: Page) -> list[DarkPatternIssue]:
"""Compare Accept and Reject button sizes and visual weight."""
issues: list[DarkPatternIssue] = []
accept_btn = await _find_button(page, ACCEPT_BUTTON_SELECTORS)
reject_btn = await _find_button(page, REJECT_BUTTON_SELECTORS)
if not accept_btn:
return issues # No accept button found — nothing to compare
if not reject_btn:
issues.append(
DarkPatternIssue(
pattern="missing_reject_button",
severity="critical",
message="No visible Reject/Decline button found on the first layer.",
recommendation=(
"Add a clearly visible 'Reject All' button on the first layer "
"of the consent banner, as required by GDPR and CNIL."
),
)
)
return issues
# Compare button areas
accept_area = accept_btn.get("area", 0)
reject_area = reject_btn.get("area", 0)
if reject_area > 0 and accept_area > 0:
ratio = accept_area / reject_area
if ratio > 1.5:
issues.append(
DarkPatternIssue(
pattern="unequal_button_size",
severity="warning",
message=(
f"Accept button is {ratio:.1f}x larger than Reject button. "
"Buttons should have equal prominence."
),
recommendation=(
"Make the Accept and Reject buttons the same size and visual weight."
),
details={
"accept_area": accept_area,
"reject_area": reject_area,
"ratio": round(ratio, 2),
},
)
)
# Compare font sizes
accept_font = accept_btn.get("fontSize", 0)
reject_font = reject_btn.get("fontSize", 0)
if reject_font > 0 and accept_font > reject_font * 1.3:
issues.append(
DarkPatternIssue(
pattern="unequal_font_size",
severity="warning",
message=(
f"Accept button font ({accept_font}px) is larger than "
f"Reject button font ({reject_font}px)."
),
recommendation="Use the same font size for both Accept and Reject buttons.",
details={
"accept_font_size": accept_font,
"reject_font_size": reject_font,
},
)
)
return issues
async def check_pre_ticked_boxes(page: Page) -> list[DarkPatternIssue]:
"""Check for pre-ticked non-essential category checkboxes."""
issues: list[DarkPatternIssue] = []
try:
pre_ticked = await page.evaluate("""() => {
const checkboxes = document.querySelectorAll(
'input[type="checkbox"][checked], input[type="checkbox"]:checked'
);
const results = [];
for (const cb of checkboxes) {
// Skip if it looks like an "essential" checkbox (often disabled)
if (cb.disabled) continue;
const label = cb.closest('label')?.textContent?.trim()
|| cb.getAttribute('aria-label')
|| cb.name
|| 'unknown';
// Skip checkboxes that appear to be for essential/necessary
const labelLower = label.toLowerCase();
if (labelLower.includes('essential') || labelLower.includes('necessary')
|| labelLower.includes('required') || labelLower.includes('strictly')) {
continue;
}
results.push({ name: cb.name || cb.id, label: label });
}
return results;
}""")
if pre_ticked:
labels = [pt["label"][:50] for pt in pre_ticked]
issues.append(
DarkPatternIssue(
pattern="pre_ticked_checkboxes",
severity="critical",
message=(
f"{len(pre_ticked)} non-essential category checkbox(es) are pre-ticked: "
f"{', '.join(labels[:3])}"
),
recommendation=(
"Non-essential category checkboxes must default to unchecked. "
"Pre-ticked boxes do not constitute valid consent under GDPR."
),
details={"checkboxes": pre_ticked},
)
)
except Exception as exc:
logger.debug("Pre-ticked checkbox check failed: %s", exc)
return issues
async def check_cookie_wall(page: Page) -> list[DarkPatternIssue]:
"""Check if a cookie wall blocks access to page content."""
issues: list[DarkPatternIssue] = []
try:
is_wall = await page.evaluate("""() => {
// Check for full-screen overlays blocking content
const overlays = document.querySelectorAll(
'[class*="overlay"], [class*="modal"], [class*="wall"]'
);
for (const overlay of overlays) {
const cs = window.getComputedStyle(overlay);
const rect = overlay.getBoundingClientRect();
// Full-viewport overlay with high z-index suggests a cookie wall
if (rect.width >= window.innerWidth * 0.9
&& rect.height >= window.innerHeight * 0.9
&& parseInt(cs.zIndex) > 100) {
return true;
}
}
// Check if body/main is hidden or has overflow hidden
const body = document.body;
const bodyStyle = window.getComputedStyle(body);
if (bodyStyle.overflow === 'hidden' && bodyStyle.position === 'fixed') {
return true;
}
return false;
}""")
if is_wall:
issues.append(
DarkPatternIssue(
pattern="cookie_wall",
severity="critical",
message="Cookie wall detected — page content appears blocked until consent.",
recommendation=(
"Remove the cookie wall. Users must be able to access the site "
"without being forced to consent to non-essential cookies."
),
)
)
except Exception as exc:
logger.debug("Cookie wall check failed: %s", exc)
return issues
async def check_scroll_dismissal(page: Page) -> list[DarkPatternIssue]:
"""Check if scrolling dismisses the consent banner (not valid consent)."""
issues: list[DarkPatternIssue] = []
try:
# Check if banner is visible before scroll
banner_visible_before = await _find_banner(page)
if not banner_visible_before:
return issues
# Scroll down
await page.evaluate("window.scrollBy(0, 500)")
await page.wait_for_timeout(1000)
# Check if banner disappeared
banner_visible_after = await _find_banner(page)
if banner_visible_before and not banner_visible_after:
issues.append(
DarkPatternIssue(
pattern="scroll_dismissal",
severity="critical",
message="Consent banner dismissed on scroll — this is not valid consent.",
recommendation=(
"Disable dismiss-on-scroll. Under GDPR, scrolling does not "
"constitute valid consent. The banner must remain until the user "
"makes an explicit choice."
),
)
)
except Exception as exc:
logger.debug("Scroll dismissal check failed: %s", exc)
return issues
async def detect_dark_patterns(page: Page) -> DarkPatternResult:
"""Run all dark pattern checks on the current page."""
url = page.url
result = DarkPatternResult(url=url)
try:
result.banner_found = await _find_banner(page)
if not result.banner_found:
return result
# Run all checks
result.issues.extend(await check_button_prominence(page))
result.issues.extend(await check_pre_ticked_boxes(page))
result.issues.extend(await check_cookie_wall(page))
result.issues.extend(await check_scroll_dismissal(page))
except Exception as exc:
result.error = str(exc)
logger.warning("Dark pattern detection failed for %s: %s", url, exc)
return result
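
A minimal standalone invocation, assuming Playwright is installed; the URL is illustrative, and the service calls detect_dark_patterns from the /validate endpoint in worker.py:

import asyncio
from playwright.async_api import async_playwright

async def main() -> None:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://example.com", wait_until="networkidle")
        result = await detect_dark_patterns(page)
        print("banner found:", result.banner_found)
        for issue in result.issues:
            print(f"[{issue.severity}] {issue.pattern}: {issue.message}")
        await browser.close()

asyncio.run(main())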

119
apps/scanner/src/sitemap.py Normal file

@@ -0,0 +1,119 @@
"""Sitemap parser for URL discovery.
Fetches and parses XML sitemaps (including sitemap indexes) to discover
URLs for crawling. Falls back to common page paths if no sitemap exists.
"""
from __future__ import annotations
import logging
from xml.etree import ElementTree
import httpx
logger = logging.getLogger(__name__)
# XML namespace used in sitemaps
_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
# Common page paths to try when no sitemap is available
_DEFAULT_PATHS = [
"/",
"/about",
"/contact",
"/privacy",
"/privacy-policy",
"/terms",
"/cookie-policy",
]
async def discover_urls(
domain: str,
*,
max_urls: int = 50,
timeout: float = 10.0,
) -> list[str]:
"""Discover URLs for a domain via sitemap or fallback paths.
Attempts to fetch /sitemap.xml first. If that fails, tries
/robots.txt for a Sitemap directive. Falls back to default paths.
"""
base = f"https://{domain}"
urls: list[str] = []
async with httpx.AsyncClient(
timeout=timeout,
follow_redirects=True,
verify=False, # noqa: S501 — scanning may target sites with self-signed certs
) as client:
# Try sitemap.xml
sitemap_urls = await _fetch_sitemap(client, f"{base}/sitemap.xml", max_urls)
if sitemap_urls:
return sitemap_urls[:max_urls]
# Try robots.txt for Sitemap directive
sitemap_url = await _find_sitemap_in_robots(client, f"{base}/robots.txt")
if sitemap_url:
sitemap_urls = await _fetch_sitemap(client, sitemap_url, max_urls)
if sitemap_urls:
return sitemap_urls[:max_urls]
# Fallback to default paths
urls = [f"{base}{path}" for path in _DEFAULT_PATHS]
return urls[:max_urls]
async def _fetch_sitemap(
client: httpx.AsyncClient,
url: str,
max_urls: int,
) -> list[str]:
"""Fetch and parse an XML sitemap. Handles sitemap indexes."""
try:
resp = await client.get(url)
if resp.status_code != 200:
return []
root = ElementTree.fromstring(resp.text)
# Check if it's a sitemap index
sitemaps = root.findall("sm:sitemap/sm:loc", _NS)
if sitemaps:
urls: list[str] = []
for sm_loc in sitemaps:
if sm_loc.text:
child_urls = await _fetch_sitemap(client, sm_loc.text, max_urls - len(urls))
urls.extend(child_urls)
if len(urls) >= max_urls:
break
return urls[:max_urls]
# Regular sitemap — extract <loc> URLs
locs = root.findall("sm:url/sm:loc", _NS)
return [loc.text for loc in locs if loc.text][:max_urls]
except Exception as exc:
logger.debug("Failed to fetch sitemap %s: %s", url, exc)
return []
async def _find_sitemap_in_robots(
client: httpx.AsyncClient,
robots_url: str,
) -> str | None:
"""Look for a Sitemap directive in robots.txt."""
try:
resp = await client.get(robots_url)
if resp.status_code != 200:
return None
for line in resp.text.splitlines():
stripped = line.strip()
if stripped.lower().startswith("sitemap:"):
return stripped.split(":", 1)[1].strip()
except Exception:
pass
return None
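
Usage sketch (the domain is illustrative):

import asyncio

urls = asyncio.run(discover_urls("example.com", max_urls=10))
# Either sitemap-derived URLs, or the default paths
# ("https://example.com/", "https://example.com/about", ...) as a fallback.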

379
apps/scanner/src/worker.py Normal file

@@ -0,0 +1,379 @@
"""Scanner HTTP service.
Exposes an HTTP endpoint that accepts scan requests, runs the Playwright
cookie crawler, and returns discovered cookies. Called by the API's Celery
worker to execute scan jobs.
"""
from __future__ import annotations
import logging
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
logger = logging.getLogger(__name__)
# ── Settings ─────────────────────────────────────────────────────────
class ScannerSettings(BaseSettings):
"""Scanner service settings from environment."""
model_config = SettingsConfigDict(env_file=".env", case_sensitive=False)
host: str = "0.0.0.0"
port: int = 8001
log_level: str = "INFO"
crawler_timeout_ms: int = 30_000
crawler_headless: bool = True
max_pages_per_scan: int = 50
# ── Request / Response schemas ───────────────────────────────────────
class ProxyRequest(BaseModel):
"""Proxy configuration for geo-located scanning."""
server: str
username: str | None = None
password: str | None = None
class ScanRequest(BaseModel):
"""Incoming scan request from the API worker."""
domain: str
urls: list[str] = Field(default_factory=list)
max_pages: int = 50
proxy: ProxyRequest | None = None
class DiscoveredCookieResponse(BaseModel):
"""A single cookie found during crawling."""
name: str
domain: str
storage_type: str = "cookie"
path: str | None = None
expires: float | None = None
http_only: bool | None = None
secure: bool | None = None
same_site: str | None = None
value_length: int = 0
script_source: str | None = None
page_url: str = ""
initiator_chain: list[str] = Field(default_factory=list)
class ScanResponse(BaseModel):
"""Result of a scan."""
domain: str
pages_crawled: int
total_cookies: int
cookies: list[DiscoveredCookieResponse]
errors: list[str] = Field(default_factory=list)
class ValidationRequest(BaseModel):
"""Request for consent validation and dark pattern detection."""
url: str
essential_cookie_names: list[str] = Field(default_factory=list)
proxy: ProxyRequest | None = None
class ValidationIssueResponse(BaseModel):
"""A single validation issue."""
check: str
severity: str
message: str
recommendation: str
details: dict = Field(default_factory=dict)
class DarkPatternIssueResponse(BaseModel):
"""A detected dark pattern."""
pattern: str
severity: str
message: str
recommendation: str
details: dict = Field(default_factory=dict)
class ValidationResponse(BaseModel):
"""Result of consent validation and dark pattern detection."""
url: str
pre_consent_issues: list[ValidationIssueResponse] = Field(default_factory=list)
post_accept_issues: list[ValidationIssueResponse] = Field(default_factory=list)
post_reject_issues: list[ValidationIssueResponse] = Field(default_factory=list)
dark_pattern_issues: list[DarkPatternIssueResponse] = Field(default_factory=list)
banner_found: bool = False
errors: list[str] = Field(default_factory=list)
# ── Application ──────────────────────────────────────────────────────
def create_app(): # noqa: ANN201
"""Create the scanner FastAPI application."""
from fastapi import FastAPI, HTTPException
from src.crawler import CookieCrawler
from src.sitemap import discover_urls
app = FastAPI(title="CMP Scanner Service", version="0.1.0")
settings = ScannerSettings()
@app.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok"}
@app.post("/scan", response_model=ScanResponse)
async def run_scan(body: ScanRequest) -> ScanResponse:
"""Execute a Playwright crawl and return discovered cookies."""
# Discover URLs if none provided
urls = body.urls
if not urls:
try:
urls = await discover_urls(
body.domain, max_urls=min(body.max_pages, settings.max_pages_per_scan)
)
except Exception as exc:
logger.warning("URL discovery failed for %s: %s", body.domain, exc)
urls = [f"https://{body.domain}/"]
if not urls:
raise HTTPException(status_code=400, detail="No URLs to scan")
# Run crawler
from src.crawler import ProxyConfig
proxy_config = None
if body.proxy:
proxy_config = ProxyConfig(
server=body.proxy.server,
username=body.proxy.username,
password=body.proxy.password,
)
crawler = CookieCrawler(
headless=settings.crawler_headless,
timeout_ms=settings.crawler_timeout_ms,
proxy=proxy_config,
)
result = await crawler.crawl_site(
urls, max_pages=min(body.max_pages, settings.max_pages_per_scan)
)
# Build response
cookies = [
DiscoveredCookieResponse(
name=c.name,
domain=c.domain,
storage_type=c.storage_type,
path=c.path,
expires=c.expires,
http_only=c.http_only,
secure=c.secure,
same_site=c.same_site,
value_length=c.value_length,
script_source=c.script_source,
page_url=c.page_url,
initiator_chain=c.initiator_chain,
)
for c in result.unique_cookies
]
errors = [p.error for p in result.pages if p.error]
return ScanResponse(
domain=result.domain,
pages_crawled=len(result.pages),
total_cookies=result.total_cookies_found,
cookies=cookies,
errors=errors,
)
@app.post("/validate", response_model=ValidationResponse)
async def run_validation(body: ValidationRequest) -> ValidationResponse:
"""Run consent signal validation and dark pattern detection."""
from playwright.async_api import async_playwright
from src.consent_validator import (
_is_tracker_request,
validate_post_accept,
validate_post_reject,
validate_pre_consent,
)
from src.crawler import ProxyConfig
from src.dark_pattern_detector import detect_dark_patterns
response = ValidationResponse(url=body.url)
essential_names = set(body.essential_cookie_names)
tracker_requests: list[str] = []
proxy_config = None
if body.proxy:
proxy_config = ProxyConfig(
server=body.proxy.server,
username=body.proxy.username,
password=body.proxy.password,
)
try:
async with async_playwright() as pw:
launch_kwargs: dict = {"headless": settings.crawler_headless}
if proxy_config:
proxy_opts: dict = {"server": proxy_config.server}
if proxy_config.username:
proxy_opts["username"] = proxy_config.username
if proxy_config.password:
proxy_opts["password"] = proxy_config.password
launch_kwargs["proxy"] = proxy_opts
browser = await pw.chromium.launch(**launch_kwargs)
try:
context = await browser.new_context(ignore_https_errors=True)
page = await context.new_page()
# Track network requests for tracker detection
def _on_request(request) -> None:
if _is_tracker_request(request.url):
tracker_requests.append(request.url)
page.on("request", _on_request)
# ── Pre-consent check ────────────────────────
await page.goto(
body.url,
wait_until="networkidle",
timeout=settings.crawler_timeout_ms,
)
pre_issues = await validate_pre_consent(
page, context, essential_names, tracker_requests
)
response.pre_consent_issues = [
ValidationIssueResponse(**vars(i)) for i in pre_issues
]
# ── Dark pattern detection ───────────────────
dp_result = await detect_dark_patterns(page)
response.banner_found = dp_result.banner_found
response.dark_pattern_issues = [
DarkPatternIssueResponse(**vars(i)) for i in dp_result.issues
]
# ── Post-accept check ────────────────────────
# Try to click Accept All
accept_selectors = [
"button:has-text('Accept All')",
"button:has-text('Accept')",
"button:has-text('Allow All')",
"button:has-text('I Agree')",
"[data-action='accept']",
]
accepted = False
for selector in accept_selectors:
try:
btn = page.locator(selector).first
if await btn.is_visible(timeout=1000):
await btn.click()
await page.wait_for_timeout(2000)
accepted = True
break
except Exception:
continue
if accepted:
tracker_requests.clear()
post_accept = await validate_post_accept(page, context)
response.post_accept_issues = [
ValidationIssueResponse(**vars(i)) for i in post_accept
]
# ── Post-reject check ────────────────────────
# Reload and reject
await context.clear_cookies()
tracker_requests.clear()
await page.goto(
body.url,
wait_until="networkidle",
timeout=settings.crawler_timeout_ms,
)
reject_selectors = [
"button:has-text('Reject All')",
"button:has-text('Reject')",
"button:has-text('Decline')",
"button:has-text('Deny')",
"[data-action='reject']",
]
rejected = False
for selector in reject_selectors:
try:
btn = page.locator(selector).first
if await btn.is_visible(timeout=1000):
await btn.click()
await page.wait_for_timeout(2000)
rejected = True
break
except Exception:
continue
if rejected:
post_reject_trackers: list[str] = []
# Collect any new tracker requests after rejection
for req_url in tracker_requests:
if _is_tracker_request(req_url):
post_reject_trackers.append(req_url)
post_reject = await validate_post_reject(
page, context, essential_names, post_reject_trackers
)
response.post_reject_issues = [
ValidationIssueResponse(**vars(i)) for i in post_reject
]
await context.close()
finally:
await browser.close()
except Exception as exc:
response.errors.append(str(exc))
logger.warning("Validation failed for %s: %s", body.url, exc)
return response
return app
# ── Entrypoint ───────────────────────────────────────────────────────
def main() -> None:
"""Run the scanner service with uvicorn."""
import uvicorn
settings = ScannerSettings()
logging.basicConfig(level=settings.log_level)
uvicorn.run(
"src.worker:create_app",
factory=True,
host=settings.host,
port=settings.port,
workers=1, # Single worker — Playwright manages its own concurrency
access_log=True,
)
if __name__ == "__main__":
main()
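
An example client call against a locally running scanner, assuming the default port; the payload mirrors the ScanRequest schema above:

import httpx

resp = httpx.post(
    "http://localhost:8001/scan",
    json={"domain": "example.com", "max_pages": 5},
    timeout=300.0,
)
resp.raise_for_status()
data = resp.json()
print(data["pages_crawled"], "pages crawled,", data["total_cookies"], "cookies found")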