fix(scanner): pre-seed ConsentOS consent so crawls see post-consent state (#2)

* fix(scanner): pre-seed accepted ConsentOS consent before crawling

A site running ConsentOS exposes one set of cookies before consent
(strictly necessary only) and a much larger set after the visitor
accepts analytics/marketing/personalisation. The scanner is meant to
answer "what does this site actually load?" — but because the crawler
clears cookies and navigates without ever interacting with the
banner, every scan returned the pre-consent view. Useful for spotting
trackers that fire before consent (which is what
``consent_validator.py`` does), useless for the cookie inventory the
admin UI exists to display.

Plant ``_consentos_consent`` on the browser context with all
categories accepted before ``page.goto``. The cookie payload mirrors
``apps/banner/src/consent.ts:writeConsent`` exactly (URL-encoded
``ConsentState`` JSON, ``Lax`` SameSite, year-long expiry) so the
loader's ``readConsent`` short-circuits straight to
``updateAcceptedCategories(['necessary','functional','analytics',
'marketing','personalisation'])`` — the blocker is bypassed and the
crawl sees what the visitor would see.

Pre-consent compliance checks live in ``consent_validator.py`` and
use a separate code path; this change only touches the cookie
inventory crawl.
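Concretely, the seeded value is plain URL-encoded compact JSON, so it round-trips cleanly. A minimal sketch (``visitorId``/``consentedAt`` elided for brevity; ``unquote`` + ``json.loads`` stand in here for what the TypeScript ``readConsent`` does on the loader side):

```python
import json
from urllib.parse import quote, unquote

# ConsentState payload as the scanner seeds it (abridged).
state = {
    "accepted": ["necessary", "functional", "analytics",
                 "marketing", "personalisation"],
    "rejected": [],
    "bannerVersion": "scanner",
}
# Mirror writeConsent: compact JSON, fully URL-encoded.
value = quote(json.dumps(state, separators=(",", ":")), safe="")

# What the loader effectively does on read: URL-decode, then parse.
assert json.loads(unquote(value)) == state
```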

* style: ruff format crawler.py
James Cottrill
2026-04-14 14:05:35 +01:00
committed by GitHub
parent 8d15ec4398
commit 0fbe2717f2
2 changed files with 152 additions and 6 deletions


@@ -1,16 +1,28 @@
"""Playwright-based headless browser cookie crawler.
For each URL: launches headless Chromium, clears cookies, navigates,
waits for network idle, enumerates document.cookie / localStorage /
sessionStorage, captures Set-Cookie headers from network requests,
and attributes cookies to source scripts via the request chain.
For each URL: launches headless Chromium, pre-seeds an
"all categories accepted" ConsentOS consent cookie, clears any other
cookies, navigates, waits for network idle, enumerates
``document.cookie`` / ``localStorage`` / ``sessionStorage``, captures
``Set-Cookie`` headers from network requests, and attributes cookies
to source scripts via the request chain.
The pre-seed is what makes the scan useful: without it the loader
would block analytics/marketing scripts and the scan would only see
strictly-necessary cookies, which tells you nothing about what the
site actually loads in the post-consent state. Pre-consent compliance
checks live in ``consent_validator.py`` and use a separate code path.
"""
from __future__ import annotations
import json
import logging
import time
import uuid
from dataclasses import dataclass, field
from urllib.parse import urlparse
from datetime import UTC, datetime
from urllib.parse import quote, urlparse
from playwright.async_api import (
BrowserContext,
@@ -22,6 +34,50 @@ from playwright.async_api import (
logger = logging.getLogger(__name__)
# All ConsentOS categories — pre-seeded as accepted on every crawl so
# the loader's "consent already given" branch fires and unblocks all
# scripts/cookies.
_ALL_CATEGORIES: list[str] = [
"necessary",
"functional",
"analytics",
"marketing",
"personalisation",
]
# Must match ``COOKIE_NAME`` in apps/banner/src/consent.ts. If you
# rename it there, rename it here too.
_CONSENT_COOKIE_NAME = "_consentos_consent"
def _build_consent_cookie(url: str) -> dict:
"""Return a Playwright cookie dict pre-seeding ConsentOS consent.
Mirrors the shape that ``apps/banner/src/consent.ts:writeConsent``
produces — URL-encoded JSON of a ``ConsentState`` — so the loader's
``readConsent`` returns a valid object and short-circuits straight
to ``updateAcceptedCategories(...)``. Categories are hard-coded to
every known ConsentOS category; the scanner is a "what does this
site load when the visitor accepts everything?" tool, by design.
"""
state = {
"visitorId": str(uuid.uuid4()),
"accepted": _ALL_CATEGORIES,
"rejected": [],
"consentedAt": datetime.now(UTC).isoformat(),
"bannerVersion": "scanner",
}
value = quote(json.dumps(state, separators=(",", ":")), safe="")
return {
"name": _CONSENT_COOKIE_NAME,
"value": value,
"url": url,
"path": "/",
"expires": time.time() + 365 * 86400,
"sameSite": "Lax",
}
# Realistic Chrome UA so sites don't block the crawler as a bot.
_DEFAULT_USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -152,8 +208,13 @@ class CookieCrawler:
user_agent=self._user_agent,
ignore_https_errors=True,
)
# Clear all cookies before visiting
# Start from a clean slate, then plant the ConsentOS consent
# cookie so the loader treats the visitor as having already
# accepted every category. Without this the scan only sees
# strictly-necessary cookies — useless for "what does this
# site actually load?" reporting.
await context.clear_cookies()
await context.add_cookies([_build_consent_cookie(url)])
page: Page = await context.new_page()