fix(scanner): pre-seed ConsentOS consent so crawls see post-consent state (#2)

* fix(scanner): pre-seed accepted ConsentOS consent before crawling

A site running ConsentOS exposes one set of cookies before consent
(strictly necessary only) and a much larger set after the visitor
accepts analytics/marketing/personalisation. The scanner is meant to
answer "what does this site actually load?" — but because the crawler
clears cookies and navigates without ever interacting with the
banner, every scan returned the pre-consent view. Useful for spotting
trackers that fire before consent (which is what
``consent_validator.py`` does), useless for the cookie inventory the
admin UI exists to display.

Plant ``_consentos_consent`` on the browser context with all
categories accepted before ``page.goto``. The cookie payload mirrors
``apps/banner/src/consent.ts:writeConsent`` exactly (URL-encoded
``ConsentState`` JSON, ``Lax`` SameSite, year-long expiry) so the
loader's ``readConsent`` short-circuits straight to
``updateAcceptedCategories(['necessary','functional','analytics',
'marketing','personalisation'])`` — the blocker is bypassed and the
crawl sees what the visitor would see.

Pre-consent compliance checks live in ``consent_validator.py`` and
use a separate code path; this change only touches the cookie
inventory crawl.

* style: ruff format crawler.py
This commit is contained in:
James Cottrill
2026-04-14 14:05:35 +01:00
committed by GitHub
parent 8d15ec4398
commit 0fbe2717f2
2 changed files with 152 additions and 6 deletions

View File

@@ -1,16 +1,28 @@
"""Playwright-based headless browser cookie crawler. """Playwright-based headless browser cookie crawler.
For each URL: launches headless Chromium, clears cookies, navigates, For each URL: launches headless Chromium, **pre-seeds an
waits for network idle, enumerates document.cookie / localStorage / "all categories accepted" ConsentOS consent cookie**, clears any other
sessionStorage, captures Set-Cookie headers from network requests, cookies, navigates, waits for network idle, enumerates
and attributes cookies to source scripts via the request chain. ``document.cookie`` / ``localStorage`` / ``sessionStorage``, captures
``Set-Cookie`` headers from network requests, and attributes cookies
to source scripts via the request chain.
The pre-seed is what makes the scan useful: without it the loader
would block analytics/marketing scripts and the scan would only see
strictly-necessary cookies, which tells you nothing about what the
site actually loads in the post-consent state. Pre-consent compliance
checks live in ``consent_validator.py`` and use a separate code path.
""" """
from __future__ import annotations from __future__ import annotations
import json
import logging import logging
import time
import uuid
from dataclasses import dataclass, field from dataclasses import dataclass, field
from urllib.parse import urlparse from datetime import UTC, datetime
from urllib.parse import quote, urlparse
from playwright.async_api import ( from playwright.async_api import (
BrowserContext, BrowserContext,
@@ -22,6 +34,50 @@ from playwright.async_api import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# All ConsentOS categories — pre-seeded as accepted on every crawl so
# the loader's "consent already given" branch fires and unblocks all
# scripts/cookies.
_ALL_CATEGORIES: list[str] = [
"necessary",
"functional",
"analytics",
"marketing",
"personalisation",
]
# Must match ``COOKIE_NAME`` in apps/banner/src/consent.ts. If you
# rename it there, rename it here too.
_CONSENT_COOKIE_NAME = "_consentos_consent"
def _build_consent_cookie(url: str) -> dict:
"""Return a Playwright cookie dict pre-seeding ConsentOS consent.
Mirrors the shape that ``apps/banner/src/consent.ts:writeConsent``
produces — URL-encoded JSON of a ``ConsentState`` — so the loader's
``readConsent`` returns a valid object and short-circuits straight
to ``updateAcceptedCategories(...)``. Categories are hard-coded to
every known ConsentOS category; the scanner is a "what does this
site load when the visitor accepts everything?" tool, by design.
"""
state = {
"visitorId": str(uuid.uuid4()),
"accepted": _ALL_CATEGORIES,
"rejected": [],
"consentedAt": datetime.now(UTC).isoformat(),
"bannerVersion": "scanner",
}
value = quote(json.dumps(state, separators=(",", ":")), safe="")
return {
"name": _CONSENT_COOKIE_NAME,
"value": value,
"url": url,
"path": "/",
"expires": time.time() + 365 * 86400,
"sameSite": "Lax",
}
# Realistic Chrome UA so sites don't block the crawler as a bot. # Realistic Chrome UA so sites don't block the crawler as a bot.
_DEFAULT_USER_AGENT = ( _DEFAULT_USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -152,8 +208,13 @@ class CookieCrawler:
user_agent=self._user_agent, user_agent=self._user_agent,
ignore_https_errors=True, ignore_https_errors=True,
) )
# Clear all cookies before visiting # Start from a clean slate, then plant the ConsentOS consent
# cookie so the loader treats the visitor as having already
# accepted every category. Without this the scan only sees
# strictly-necessary cookies — useless for "what does this
# site actually load?" reporting.
await context.clear_cookies() await context.clear_cookies()
await context.add_cookies([_build_consent_cookie(url)])
page: Page = await context.new_page() page: Page = await context.new_page()

View File

@@ -8,10 +8,13 @@ from unittest.mock import AsyncMock, MagicMock, patch
import pytest import pytest
from src.crawler import ( from src.crawler import (
_ALL_CATEGORIES,
_CONSENT_COOKIE_NAME,
CookieCrawler, CookieCrawler,
CrawlResult, CrawlResult,
DiscoveredCookie, DiscoveredCookie,
SiteCrawlResult, SiteCrawlResult,
_build_consent_cookie,
_build_initiator_chain, _build_initiator_chain,
_get_script_initiator, _get_script_initiator,
) )
@@ -438,3 +441,85 @@ class TestCrawlSite:
await crawler.crawl_site(["https://example.com/"]) await crawler.crawl_site(["https://example.com/"])
browser.close.assert_awaited_once() browser.close.assert_awaited_once()
# ── Consent pre-seed ────────────────────────────────────────────────────
class TestBuildConsentCookie:
    """The pre-seeded ``_consentos_consent`` cookie."""

    def test_cookie_name_matches_loader(self):
        cookie = _build_consent_cookie("https://example.com/")
        assert cookie["name"] == _CONSENT_COOKIE_NAME == "_consentos_consent"

    def test_cookie_scope_is_accepted_by_playwright(self):
        """Playwright takes ``url`` *or* ``domain`` + ``path``, never a mix.

        The mocked browser context used elsewhere in this module does no
        validation, so the scoping contract is asserted here directly.
        """
        cookie = _build_consent_cookie("https://example.com/page")
        has_url = "url" in cookie
        has_pair = "domain" in cookie and "path" in cookie
        assert has_url or has_pair
        if has_url:
            assert "domain" not in cookie
            assert "path" not in cookie
            assert cookie["url"].startswith("https://example.com/")
        else:
            assert cookie["domain"] == "example.com"
            # Site-wide, like the cookie the banner itself writes.
            assert cookie["path"] == "/"

    def test_cookie_value_decodes_to_consent_state_with_all_categories(self):
        import json as _json
        from urllib.parse import unquote

        cookie = _build_consent_cookie("https://example.com/")
        state = _json.loads(unquote(cookie["value"]))
        assert sorted(state["accepted"]) == sorted(_ALL_CATEGORIES)
        assert state["rejected"] == []
        # ConsentState fields the loader's readConsent() relies on
        assert "visitorId" in state
        assert "consentedAt" in state
        assert "bannerVersion" in state

    def test_cookie_expires_far_in_future(self):
        import time as _time

        cookie = _build_consent_cookie("https://example.com/")
        # ~1 year, allow generous slack for test timing
        assert cookie["expires"] > _time.time() + 300 * 86400

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.crawler.async_playwright")
    async def test_crawl_seeds_consent_before_navigation(self, mock_pw):
        """``add_cookies`` must be called before ``page.goto``."""
        page = _make_mock_page()
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)

        # Track call order on the context
        call_order: list[str] = []
        original_add = context.add_cookies
        original_clear = context.clear_cookies

        async def _add(*args, **kwargs):
            call_order.append("add_cookies")
            return await original_add(*args, **kwargs)

        async def _clear(*args, **kwargs):
            call_order.append("clear_cookies")
            return await original_clear(*args, **kwargs)

        async def _goto(*args, **kwargs):
            call_order.append("goto")

        context.add_cookies = AsyncMock(side_effect=_add)
        context.clear_cookies = AsyncMock(side_effect=_clear)
        page.goto = AsyncMock(side_effect=_goto)

        pw_instance = AsyncMock()
        pw_instance.chromium.launch = AsyncMock(return_value=browser)
        mock_pw.return_value.__aenter__ = AsyncMock(return_value=pw_instance)
        mock_pw.return_value.__aexit__ = AsyncMock(return_value=False)

        crawler = CookieCrawler()
        await crawler.crawl_site(["https://example.com/"])

        assert call_order == ["clear_cookies", "add_cookies", "goto"], call_order

        # And the cookie payload was the one we expect: a single consent
        # cookie aimed at the crawled site, whichever scoping form
        # (``url`` or ``domain``) the builder uses.
        seeded = context.add_cookies.call_args.args[0]
        assert len(seeded) == 1
        assert seeded[0]["name"] == "_consentos_consent"
        target = seeded[0].get("url") or seeded[0].get("domain", "")
        assert "example.com" in target