From 0fbe2717f24639b3766e0d33ee0a7a55b8aa672c Mon Sep 17 00:00:00 2001
From: James Cottrill <32595786+jamescottrill@users.noreply.github.com>
Date: Tue, 14 Apr 2026 14:05:35 +0100
Subject: [PATCH] fix(scanner): pre-seed ConsentOS consent so crawls see
 post-consent state (#2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(scanner): pre-seed accepted ConsentOS consent before crawling

A site running ConsentOS exposes one set of cookies before consent
(strictly necessary only) and a much larger set after the visitor
accepts analytics/marketing/personalisation. The scanner is meant to
answer "what does this site actually load?" — but because the crawler
clears cookies and navigates without ever interacting with the
banner, every scan returned the pre-consent view. Useful for spotting
trackers that fire before consent (which is what
``consent_validator.py`` does), useless for the cookie inventory the
admin UI exists to display.

Plant ``_consentos_consent`` on the browser context with all
categories accepted before ``page.goto``. The cookie payload mirrors
``apps/banner/src/consent.ts:writeConsent`` exactly (URL-encoded
``ConsentState`` JSON, ``Lax`` SameSite, year-long expiry) so the
loader's ``readConsent`` short-circuits straight to
``updateAcceptedCategories(['necessary','functional','analytics',
'marketing','personalisation'])`` — the blocker is bypassed and the
crawl sees what the visitor would see.

Pre-consent compliance checks live in ``consent_validator.py`` and
use a separate code path; this change only touches the cookie
inventory crawl.

* style: ruff format crawler.py
---
 apps/scanner/src/crawler.py        | 73 ++++++++++++++++++++++---
 apps/scanner/tests/test_crawler.py | 85 ++++++++++++++++++++++++++++++
 2 files changed, 152 insertions(+), 6 deletions(-)

diff --git a/apps/scanner/src/crawler.py b/apps/scanner/src/crawler.py
index f998df8..c87f5fb 100644
--- a/apps/scanner/src/crawler.py
+++ b/apps/scanner/src/crawler.py
@@ -1,16 +1,28 @@
 """Playwright-based headless browser cookie crawler.
 
-For each URL: launches headless Chromium, clears cookies, navigates,
-waits for network idle, enumerates document.cookie / localStorage /
-sessionStorage, captures Set-Cookie headers from network requests,
-and attributes cookies to source scripts via the request chain.
+For each URL: launches headless Chromium, clears all cookies, then
+**pre-seeds an "all categories accepted" ConsentOS consent cookie**,
+navigates, waits for network idle, enumerates
+``document.cookie`` / ``localStorage`` / ``sessionStorage``, captures
+``Set-Cookie`` headers from network requests, and attributes cookies
+to source scripts via the request chain.
+
+The pre-seed is what makes the scan useful: without it the loader
+would block analytics/marketing scripts and the scan would only see
+strictly-necessary cookies, which tells you nothing about what the
+site actually loads in the post-consent state. Pre-consent compliance
+checks live in ``consent_validator.py`` and use a separate code path.
 """
 
 from __future__ import annotations
 
+import json
 import logging
+import time
+import uuid
 from dataclasses import dataclass, field
-from urllib.parse import urlparse
+from datetime import UTC, datetime
+from urllib.parse import quote, urlparse
 
 from playwright.async_api import (
     BrowserContext,
@@ -22,6 +34,50 @@ from playwright.async_api import (
 
 logger = logging.getLogger(__name__)
 
+# Every ConsentOS category. ``_build_consent_cookie`` marks all of them
+# accepted so the loader's "consent already given" branch fires and
+# unblocks all scripts/cookies.
+_ALL_CATEGORIES: list[str] = [
+    "necessary",
+    "functional",
+    "analytics",
+    "marketing",
+    "personalisation",
+]
+
+# Must stay in sync with ``COOKIE_NAME`` in apps/banner/src/consent.ts:
+# if it is renamed there, rename it here too.
+_CONSENT_COOKIE_NAME = "_consentos_consent"
+
+
+def _build_consent_cookie(url: str) -> dict:
+    """Return a Playwright cookie dict pre-seeding ConsentOS consent.
+
+    The value is the URL-encoded compact JSON of a ``ConsentState``
+    (fresh random ``visitorId``, every category accepted, none
+    rejected, UTC ``consentedAt``), meant to mirror ``writeConsent`` in
+    ``apps/banner/src/consent.ts``. NOTE(review): Playwright documents
+    ``url`` and ``domain``/``path`` as alternatives — confirm that
+    ``add_cookies`` accepts a cookie carrying both ``url`` and ``path``.
+    """
+    state = {
+        "visitorId": str(uuid.uuid4()),
+        "accepted": _ALL_CATEGORIES,
+        "rejected": [],
+        "consentedAt": datetime.now(UTC).isoformat(),
+        "bannerVersion": "scanner",
+    }
+    value = quote(json.dumps(state, separators=(",", ":")), safe="")
+    return {
+        "name": _CONSENT_COOKIE_NAME,
+        "value": value,
+        "url": url,
+        "path": "/",
+        "expires": time.time() + 365 * 86400,  # one year from now, in Unix seconds
+        "sameSite": "Lax",
+    }
+
+
 # Realistic Chrome UA so sites don't block the crawler as a bot.
 _DEFAULT_USER_AGENT = (
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -152,8 +208,13 @@ class CookieCrawler:
                 user_agent=self._user_agent,
                 ignore_https_errors=True,
             )
-            # Clear all cookies before visiting
+            # Start from a clean slate, then plant the ConsentOS consent
+            # cookie so the loader treats the visitor as having already
+            # accepted every category. Without this the scan only sees
+            # strictly-necessary cookies — useless for "what does this
+            # site actually load?" reporting.
             await context.clear_cookies()
+            await context.add_cookies([_build_consent_cookie(url)])
 
             page: Page = await context.new_page()
 
diff --git a/apps/scanner/tests/test_crawler.py b/apps/scanner/tests/test_crawler.py
index 509f127..797c08b 100644
--- a/apps/scanner/tests/test_crawler.py
+++ b/apps/scanner/tests/test_crawler.py
@@ -8,10 +8,13 @@ from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 
 from src.crawler import (
+    _ALL_CATEGORIES,
+    _CONSENT_COOKIE_NAME,
     CookieCrawler,
     CrawlResult,
     DiscoveredCookie,
     SiteCrawlResult,
+    _build_consent_cookie,
     _build_initiator_chain,
     _get_script_initiator,
 )
@@ -438,3 +441,85 @@ class TestCrawlSite:
         await crawler.crawl_site(["https://example.com/"])
 
         browser.close.assert_awaited_once()
+
+
+# ── Consent pre-seed ────────────────────────────────────────────────────
+
+
+class TestBuildConsentCookie:
+    """Shape and timing of the pre-seeded ``_consentos_consent`` cookie."""
+
+    def test_cookie_name_matches_loader(self):
+        cookie = _build_consent_cookie("https://example.com/")
+        assert cookie["name"] == _CONSENT_COOKIE_NAME == "_consentos_consent"
+
+    def test_cookie_is_url_scoped_for_playwright(self):
+        """Cookie carries the crawled URL verbatim and a root ``path``."""
+        cookie = _build_consent_cookie("https://example.com/page")
+        assert cookie["url"] == "https://example.com/page"
+        assert cookie["path"] == "/"
+
+    def test_cookie_value_decodes_to_consent_state_with_all_categories(self):
+        import json as _json
+        from urllib.parse import unquote
+
+        cookie = _build_consent_cookie("https://example.com/")
+        state = _json.loads(unquote(cookie["value"]))
+
+        assert sorted(state["accepted"]) == sorted(_ALL_CATEGORIES)
+        assert state["rejected"] == []
+        # ConsentState fields the loader's readConsent() relies on
+        assert "visitorId" in state
+        assert "consentedAt" in state
+        assert "bannerVersion" in state
+
+    def test_cookie_expires_far_in_future(self):
+        import time as _time
+
+        cookie = _build_consent_cookie("https://example.com/")
+        # Seeded for ~365 days; 300 leaves generous slack for test timing
+        assert cookie["expires"] > _time.time() + 300 * 86400
+
+    @pytest.mark.asyncio(loop_scope="session")
+    @patch("src.crawler.async_playwright")
+    async def test_crawl_seeds_consent_before_navigation(self, mock_pw):
+        """Cookies are cleared, then seeded, and only then is the page loaded."""
+        page = _make_mock_page()
+        context = _make_mock_context(page)
+        browser = _make_mock_browser(context)
+
+        # Record the order in which the mocked context/page calls happen
+        call_order: list[str] = []
+        original_add = context.add_cookies
+        original_clear = context.clear_cookies
+
+        async def _add(*args, **kwargs):
+            call_order.append("add_cookies")
+            return await original_add(*args, **kwargs)
+
+        async def _clear(*args, **kwargs):
+            call_order.append("clear_cookies")
+            return await original_clear(*args, **kwargs)
+
+        async def _goto(*args, **kwargs):
+            call_order.append("goto")
+
+        context.add_cookies = AsyncMock(side_effect=_add)
+        context.clear_cookies = AsyncMock(side_effect=_clear)
+        page.goto = AsyncMock(side_effect=_goto)
+
+        pw_instance = AsyncMock()
+        pw_instance.chromium.launch = AsyncMock(return_value=browser)
+        mock_pw.return_value.__aenter__ = AsyncMock(return_value=pw_instance)
+        mock_pw.return_value.__aexit__ = AsyncMock(return_value=False)
+
+        crawler = CookieCrawler()
+        await crawler.crawl_site(["https://example.com/"])
+
+        assert call_order == ["clear_cookies", "add_cookies", "goto"], call_order
+
+        # ...and exactly one cookie was seeded, scoped to the crawled URL
+        seeded = context.add_cookies.call_args.args[0]
+        assert len(seeded) == 1
+        assert seeded[0]["name"] == "_consentos_consent"
+        assert seeded[0]["url"] == "https://example.com/"