fix(scanner): pre-seed ConsentOS consent so crawls see post-consent state (#2)

* fix(scanner): pre-seed accepted ConsentOS consent before crawling A site running ConsentOS exposes one set of cookies before consent (strictly necessary only) and a much larger set after the visitor accepts analytics/marketing/personalisation. The scanner is meant to answer "what does this site actually load?" — but because the crawler clears cookies and navigates without ever interacting with the banner, every scan returned the pre-consent view. Useful for spotting trackers that fire before consent (which is what ``consent_validator.py`` does), useless for the cookie inventory the admin UI exists to display. Plant ``_consentos_consent`` on the browser context with all categories accepted before ``page.goto``. The cookie payload mirrors ``apps/banner/src/consent.ts:writeConsent`` exactly (URL-encoded ``ConsentState`` JSON, ``Lax`` SameSite, year-long expiry) so the loader's ``readConsent`` short-circuits straight to ``updateAcceptedCategories(['necessary','functional','analytics', 'marketing','personalisation'])`` — the blocker is bypassed and the crawl sees what the visitor would see. Pre-consent compliance checks live in ``consent_validator.py`` and use a separate code path; this change only touches the cookie inventory crawl. * style: ruff format crawler.py
2026-04-14 14:05:35 +01:00
parent 8d15ec4398
commit 0fbe2717f2
2 changed files with 152 additions and 6 deletions
--- a/apps/scanner/src/crawler.py
+++ b/apps/scanner/src/crawler.py
@@ -1,16 +1,28 @@
 """Playwright-based headless browser cookie crawler.
-For each URL: launches headless Chromium, clears cookies, navigates,
+For each URL: launches headless Chromium, clears cookies, **pre-seeds
-waits for network idle, enumerates document.cookie / localStorage /
+an "all categories accepted" ConsentOS consent cookie**,
-sessionStorage, captures Set-Cookie headers from network requests,
+navigates, waits for network idle, enumerates
-and attributes cookies to source scripts via the request chain.
+``document.cookie`` / ``localStorage`` / ``sessionStorage``, captures
 ``Set-Cookie`` headers from network requests, and attributes cookies
 to source scripts via the request chain.
 The pre-seed is what makes the scan useful: without it the loader
 would block analytics/marketing scripts and the scan would only see
 strictly-necessary cookies, which tells you nothing about what the
 site actually loads in the post-consent state. Pre-consent compliance
 checks live in ``consent_validator.py`` and use a separate code path.
 """
 from __future__ import annotations
 import json
 import logging
 import time
 import uuid
 from dataclasses import dataclass, field
-from urllib.parse import urlparse
+from datetime import UTC, datetime
 from urllib.parse import quote, urlparse
 from playwright.async_api import (
    BrowserContext,
@@ -22,6 +34,50 @@ from playwright.async_api import (
 logger = logging.getLogger(__name__)
 # All ConsentOS categories — pre-seeded as accepted on every crawl so
 # the loader's "consent already given" branch fires and unblocks all
 # scripts/cookies. ``_build_consent_cookie`` writes this list into the
 # ``accepted`` field of the seeded consent state.
 _ALL_CATEGORIES: list[str] = [
    "necessary",
    "functional",
    "analytics",
    "marketing",
    "personalisation",
 ]
 # Must match ``COOKIE_NAME`` in apps/banner/src/consent.ts. If you
 # rename it there, rename it here too. Used as the ``name`` of the
 # cookie built by ``_build_consent_cookie``.
 _CONSENT_COOKIE_NAME = "_consentos_consent"
 def _build_consent_cookie(url: str) -> dict:
    """Return a Playwright cookie dict pre-seeding ConsentOS consent.
    Mirrors the shape that ``apps/banner/src/consent.ts:writeConsent``
    produces — URL-encoded JSON of a ``ConsentState`` — so the loader's
    ``readConsent`` returns a valid object and short-circuits straight
    to ``updateAcceptedCategories(...)``. Categories are hard-coded to
    every known ConsentOS category; the scanner is a "what does this
    site load when the visitor accepts everything?" tool, by design.
    The cookie is scoped with an explicit ``domain`` / ``path`` pair
    rather than ``url``: Playwright accepts either ``url`` *or*
    ``domain`` + ``path`` and rejects a cookie that combines ``url``
    with ``path``. Using the pair also pins the path to ``/`` instead
    of the directory of the crawled URL.
    Args:
        url: Absolute URL about to be crawled; only its scheme and host
            are used to scope the cookie.
    Returns:
        A dict suitable for ``BrowserContext.add_cookies``.
    Raises:
        ValueError: If ``url`` has no host to scope the cookie to.
    """
    target = urlparse(url)
    if not target.hostname:
        raise ValueError(f"Cannot scope consent cookie: no host in URL {url!r}")
    state = {
        "visitorId": str(uuid.uuid4()),
        # Copy so the serialised state never aliases the module constant.
        "accepted": list(_ALL_CATEGORIES),
        "rejected": [],
        "consentedAt": datetime.now(UTC).isoformat(),
        "bannerVersion": "scanner",
    }
    # Compact JSON, fully percent-encoded, so the value is cookie-safe.
    value = quote(json.dumps(state, separators=(",", ":")), safe="")
    return {
        "name": _CONSENT_COOKIE_NAME,
        "value": value,
        "domain": target.hostname,
        "path": "/",
        # Same flag Playwright would derive from an https ``url``.
        "secure": target.scheme == "https",
        "expires": time.time() + 365 * 86400,
        "sameSite": "Lax",
    }
 # Realistic Chrome UA so sites don't block the crawler as a bot.
 _DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -152,8 +208,13 @@ class CookieCrawler:
                user_agent=self._user_agent,
                ignore_https_errors=True,
            )
-            # Clear all cookies before visiting
+            # Start from a clean slate, then plant the ConsentOS consent
            # cookie so the loader treats the visitor as having already
            # accepted every category. Without this the scan only sees
            # strictly-necessary cookies — useless for "what does this
            # site actually load?" reporting.
            await context.clear_cookies()
            await context.add_cookies([_build_consent_cookie(url)])
            page: Page = await context.new_page()
--- a/apps/scanner/tests/test_crawler.py
+++ b/apps/scanner/tests/test_crawler.py
@@ -8,10 +8,13 @@ from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 from src.crawler import (
    _ALL_CATEGORIES,
    _CONSENT_COOKIE_NAME,
    CookieCrawler,
    CrawlResult,
    DiscoveredCookie,
    SiteCrawlResult,
    _build_consent_cookie,
    _build_initiator_chain,
    _get_script_initiator,
 )
@@ -438,3 +441,85 @@ class TestCrawlSite:
        await crawler.crawl_site(["https://example.com/"])
        browser.close.assert_awaited_once()
 # ── Consent pre-seed ────────────────────────────────────────────────────
 class TestBuildConsentCookie:
    """The pre-seeded ``_consentos_consent`` cookie."""
    def test_cookie_name_matches_loader(self):
        cookie = _build_consent_cookie("https://example.com/")
        assert cookie["name"] == _CONSENT_COOKIE_NAME == "_consentos_consent"
    def test_cookie_scope_is_valid_for_playwright(self):
        """Scope must be ``url`` alone or a ``domain`` / ``path`` pair.
        Playwright rejects a cookie that combines ``url`` with ``domain``
        or ``path``, so a mixed dict would make ``add_cookies`` raise on
        a real browser context even though the mocks here accept it.
        """
        cookie = _build_consent_cookie("https://example.com/page")
        if "url" in cookie:
            assert "domain" not in cookie
            assert "path" not in cookie
            assert cookie["url"].startswith("https://example.com/")
        else:
            assert cookie["domain"] == "example.com"
            assert cookie["path"] == "/"
    def test_cookie_value_decodes_to_consent_state_with_all_categories(self):
        import json as _json
        from urllib.parse import unquote
        cookie = _build_consent_cookie("https://example.com/")
        state = _json.loads(unquote(cookie["value"]))
        assert sorted(state["accepted"]) == sorted(_ALL_CATEGORIES)
        assert state["rejected"] == []
        # ConsentState fields the loader's readConsent() relies on
        for key in ("visitorId", "consentedAt", "bannerVersion"):
            assert key in state
    def test_cookie_expires_far_in_future(self):
        import time as _time
        cookie = _build_consent_cookie("https://example.com/")
        # ~1 year, allow generous slack for test timing
        assert cookie["expires"] > _time.time() + 300 * 86400
    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.crawler.async_playwright")
    async def test_crawl_seeds_consent_before_navigation(self, mock_pw):
        """``add_cookies`` must be called before ``page.goto``."""
        page = _make_mock_page()
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)
        # Track call order across the context and the page
        call_order: list[str] = []
        def _recording(label, inner=None):
            """Wrap ``inner`` in an AsyncMock that logs ``label`` first."""
            async def _call(*args, **kwargs):
                call_order.append(label)
                if inner is not None:
                    return await inner(*args, **kwargs)
                return None
            return AsyncMock(side_effect=_call)
        context.add_cookies = _recording("add_cookies", context.add_cookies)
        context.clear_cookies = _recording("clear_cookies", context.clear_cookies)
        page.goto = _recording("goto")
        pw_instance = AsyncMock()
        pw_instance.chromium.launch = AsyncMock(return_value=browser)
        mock_pw.return_value.__aenter__ = AsyncMock(return_value=pw_instance)
        mock_pw.return_value.__aexit__ = AsyncMock(return_value=False)
        crawler = CookieCrawler()
        await crawler.crawl_site(["https://example.com/"])
        assert call_order == ["clear_cookies", "add_cookies", "goto"], call_order
        # And the cookie payload was the one we expect, scoped to the site
        seeded = context.add_cookies.call_args.args[0]
        assert len(seeded) == 1
        assert seeded[0]["name"] == "_consentos_consent"
        scope = seeded[0].get("url") or seeded[0].get("domain", "")
        assert "example.com" in scope