From 0fbe2717f24639b3766e0d33ee0a7a55b8aa672c Mon Sep 17 00:00:00 2001
From: James Cottrill <32595786+jamescottrill@users.noreply.github.com>
Date: Tue, 14 Apr 2026 14:05:35 +0100
Subject: [PATCH] fix(scanner): pre-seed ConsentOS consent so crawls see
 post-consent state (#2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(scanner): pre-seed accepted ConsentOS consent before crawling

A site running ConsentOS exposes one set of cookies before consent
(strictly necessary only) and a much larger set after the visitor
accepts analytics/marketing/personalisation. The scanner is meant to
answer "what does this site actually load?" — but because the crawler
clears cookies and navigates without ever interacting with the banner,
every scan returned the pre-consent view. Useful for spotting trackers
that fire before consent (which is what ``consent_validator.py`` does),
useless for the cookie inventory the admin UI exists to display.

Plant ``_consentos_consent`` on the browser context with all categories
accepted before ``page.goto``. The cookie payload mirrors
``apps/banner/src/consent.ts:writeConsent`` exactly (URL-encoded
``ConsentState`` JSON, ``Lax`` SameSite, year-long expiry) so the
loader's ``readConsent`` short-circuits straight to
``updateAcceptedCategories(['necessary','functional','analytics',
'marketing','personalisation'])`` — the blocker is bypassed and the
crawl sees what the visitor would see.

Pre-consent compliance checks live in ``consent_validator.py`` and use
a separate code path; this change only touches the cookie inventory
crawl.

* style: ruff format crawler.py
---
 apps/scanner/src/crawler.py        | 73 ++++++++++++++++++++++---
 apps/scanner/tests/test_crawler.py | 85 ++++++++++++++++++++++++++++++
 2 files changed, 152 insertions(+), 6 deletions(-)

diff --git a/apps/scanner/src/crawler.py b/apps/scanner/src/crawler.py
index f998df8..c87f5fb 100644
--- a/apps/scanner/src/crawler.py
+++ b/apps/scanner/src/crawler.py
@@ -1,16 +1,28 @@
 """Playwright-based headless browser cookie crawler.
 
-For each URL: launches headless Chromium, clears cookies, navigates,
-waits for network idle, enumerates document.cookie / localStorage /
-sessionStorage, captures Set-Cookie headers from network requests,
-and attributes cookies to source scripts via the request chain.
+For each URL: launches headless Chromium, clears cookies, **pre-seeds
+an "all categories accepted" ConsentOS consent cookie**, navigates,
+waits for network idle, enumerates ``document.cookie`` /
+``localStorage`` / ``sessionStorage``, captures ``Set-Cookie`` headers
+from network requests, and attributes cookies to source scripts via
+the request chain.
+
+The pre-seed is what makes the scan useful: without it the loader
+would block analytics/marketing scripts and the scan would only see
+strictly-necessary cookies, which tells you nothing about what the
+site actually loads in the post-consent state. Pre-consent compliance
+checks live in ``consent_validator.py`` and use a separate code path.
 """
 
 from __future__ import annotations
 
+import json
 import logging
+import time
+import uuid
 from dataclasses import dataclass, field
-from urllib.parse import urlparse
+from datetime import UTC, datetime
+from urllib.parse import quote, urljoin, urlparse
 
 from playwright.async_api import (
     BrowserContext,
@@ -22,6 +34,50 @@ from playwright.async_api import (
 
 logger = logging.getLogger(__name__)
 
+# All ConsentOS categories — pre-seeded as accepted on every crawl so
+# the loader's "consent already given" branch fires and unblocks all
+# scripts/cookies.
+_ALL_CATEGORIES: list[str] = [
+    "necessary",
+    "functional",
+    "analytics",
+    "marketing",
+    "personalisation",
+]
+
+# Must match ``COOKIE_NAME`` in apps/banner/src/consent.ts. If you
+# rename it there, rename it here too.
+_CONSENT_COOKIE_NAME = "_consentos_consent"
+
+
+def _build_consent_cookie(url: str) -> dict:
+    """Return a Playwright cookie dict pre-seeding ConsentOS consent.
+
+    Mirrors the shape that ``apps/banner/src/consent.ts:writeConsent``
+    produces — URL-encoded JSON of a ``ConsentState`` — so the loader's
+    ``readConsent`` returns a valid object and short-circuits straight
+    to ``updateAcceptedCategories(...)``. Categories are hard-coded to
+    every known ConsentOS category; the scanner is a "what does this
+    site load when the visitor accepts everything?" tool, by design.
+    """
+    state = {
+        "visitorId": str(uuid.uuid4()),
+        "accepted": _ALL_CATEGORIES,
+        "rejected": [],
+        "consentedAt": datetime.now(UTC).isoformat(),
+        "bannerVersion": "scanner",
+    }
+    value = quote(json.dumps(state, separators=(",", ":")), safe="")
+    return {
+        "name": _CONSENT_COOKIE_NAME,
+        "value": value,
+        # Root URL only: Playwright rejects ``url`` + ``path`` together.
+        "url": urljoin(url, "/"),
+        "expires": time.time() + 365 * 86400,
+        "sameSite": "Lax",
+    }
+
+
 # Realistic Chrome UA so sites don't block the crawler as a bot.
 _DEFAULT_USER_AGENT = (
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -152,8 +208,13 @@ class CookieCrawler:
             user_agent=self._user_agent,
             ignore_https_errors=True,
         )
-        # Clear all cookies before visiting
+        # Start from a clean slate, then plant the ConsentOS consent
+        # cookie so the loader treats the visitor as having already
+        # accepted every category. Without this the scan only sees
+        # strictly-necessary cookies — useless for "what does this
+        # site actually load?" reporting.
         await context.clear_cookies()
+        await context.add_cookies([_build_consent_cookie(url)])
 
         page: Page = await context.new_page()
 
diff --git a/apps/scanner/tests/test_crawler.py b/apps/scanner/tests/test_crawler.py
index 509f127..797c08b 100644
--- a/apps/scanner/tests/test_crawler.py
+++ b/apps/scanner/tests/test_crawler.py
@@ -8,10 +8,13 @@ from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 
 from src.crawler import (
+    _ALL_CATEGORIES,
+    _CONSENT_COOKIE_NAME,
     CookieCrawler,
     CrawlResult,
     DiscoveredCookie,
     SiteCrawlResult,
+    _build_consent_cookie,
     _build_initiator_chain,
     _get_script_initiator,
 )
@@ -438,3 +441,85 @@ class TestCrawlSite:
         await crawler.crawl_site(["https://example.com/"])
 
         browser.close.assert_awaited_once()
+
+
+# ── Consent pre-seed ────────────────────────────────────────────────────
+
+
+class TestBuildConsentCookie:
+    """The pre-seeded ``_consentos_consent`` cookie."""
+
+    def test_cookie_name_matches_loader(self):
+        cookie = _build_consent_cookie("https://example.com/")
+        assert cookie["name"] == _CONSENT_COOKIE_NAME == "_consentos_consent"
+
+    def test_cookie_is_url_scoped_for_playwright(self):
+        """``url`` lets Playwright derive domain / path / secure."""
+        cookie = _build_consent_cookie("https://example.com/page")
+        assert cookie["url"] == "https://example.com/"
+        assert "path" not in cookie
+
+    def test_cookie_value_decodes_to_consent_state_with_all_categories(self):
+        import json as _json
+        from urllib.parse import unquote
+
+        cookie = _build_consent_cookie("https://example.com/")
+        state = _json.loads(unquote(cookie["value"]))
+
+        assert sorted(state["accepted"]) == sorted(_ALL_CATEGORIES)
+        assert state["rejected"] == []
+        # ConsentState fields the loader's readConsent() relies on
+        assert "visitorId" in state
+        assert "consentedAt" in state
+        assert "bannerVersion" in state
+
+    def test_cookie_expires_far_in_future(self):
+        import time as _time
+
+        cookie = _build_consent_cookie("https://example.com/")
+        # ~1 year, allow generous slack for test timing
+        assert cookie["expires"] > _time.time() + 300 * 86400
+
+    @pytest.mark.asyncio(loop_scope="session")
+    @patch("src.crawler.async_playwright")
+    async def test_crawl_seeds_consent_before_navigation(self, mock_pw):
+        """``add_cookies`` must be called before ``page.goto``."""
+        page = _make_mock_page()
+        context = _make_mock_context(page)
+        browser = _make_mock_browser(context)
+
+        # Track call order on the context
+        call_order: list[str] = []
+        original_add = context.add_cookies
+        original_clear = context.clear_cookies
+
+        async def _add(*args, **kwargs):
+            call_order.append("add_cookies")
+            return await original_add(*args, **kwargs)
+
+        async def _clear(*args, **kwargs):
+            call_order.append("clear_cookies")
+            return await original_clear(*args, **kwargs)
+
+        async def _goto(*args, **kwargs):
+            call_order.append("goto")
+
+        context.add_cookies = AsyncMock(side_effect=_add)
+        context.clear_cookies = AsyncMock(side_effect=_clear)
+        page.goto = AsyncMock(side_effect=_goto)
+
+        pw_instance = AsyncMock()
+        pw_instance.chromium.launch = AsyncMock(return_value=browser)
+        mock_pw.return_value.__aenter__ = AsyncMock(return_value=pw_instance)
+        mock_pw.return_value.__aexit__ = AsyncMock(return_value=False)
+
+        crawler = CookieCrawler()
+        await crawler.crawl_site(["https://example.com/"])
+
+        assert call_order == ["clear_cookies", "add_cookies", "goto"], call_order
+
+        # And the cookie payload was the one we expect
+        seeded = context.add_cookies.call_args.args[0]
+        assert len(seeded) == 1
+        assert seeded[0]["name"] == "_consentos_consent"
+        assert seeded[0]["url"] == "https://example.com/"