fix(scanner): pre-seed ConsentOS consent so crawls see post-consent state (#2)
* fix(scanner): pre-seed accepted ConsentOS consent before crawling

  A site running ConsentOS exposes one set of cookies before consent (strictly necessary only) and a much larger set after the visitor accepts analytics/marketing/personalisation. The scanner is meant to answer "what does this site actually load?" — but because the crawler clears cookies and navigates without ever interacting with the banner, every scan returned the pre-consent view. That view is useful for spotting trackers that fire before consent (which is what ``consent_validator.py`` does), but useless for the cookie inventory the admin UI exists to display.

  Plant ``_consentos_consent`` on the browser context with all categories accepted before ``page.goto``. The cookie payload mirrors ``apps/banner/src/consent.ts:writeConsent`` exactly (URL-encoded ``ConsentState`` JSON, ``Lax`` SameSite, year-long expiry) so the loader's ``readConsent`` short-circuits straight to ``updateAcceptedCategories(['necessary', 'functional', 'analytics', 'marketing', 'personalisation'])`` — the blocker is bypassed and the crawl sees what the visitor would see.

  Pre-consent compliance checks live in ``consent_validator.py`` and use a separate code path; this change only touches the cookie inventory crawl.

* style: ruff format crawler.py
This commit is contained in:
@@ -1,16 +1,28 @@
|
|||||||
"""Playwright-based headless browser cookie crawler.
|
"""Playwright-based headless browser cookie crawler.
|
||||||
|
|
||||||
For each URL: launches headless Chromium, clears cookies, navigates,
|
For each URL: launches headless Chromium, **pre-seeds an
|
||||||
waits for network idle, enumerates document.cookie / localStorage /
|
"all categories accepted" ConsentOS consent cookie**, clears any other
|
||||||
sessionStorage, captures Set-Cookie headers from network requests,
|
cookies, navigates, waits for network idle, enumerates
|
||||||
and attributes cookies to source scripts via the request chain.
|
``document.cookie`` / ``localStorage`` / ``sessionStorage``, captures
|
||||||
|
``Set-Cookie`` headers from network requests, and attributes cookies
|
||||||
|
to source scripts via the request chain.
|
||||||
|
|
||||||
|
The pre-seed is what makes the scan useful: without it the loader
|
||||||
|
would block analytics/marketing scripts and the scan would only see
|
||||||
|
strictly-necessary cookies, which tells you nothing about what the
|
||||||
|
site actually loads in the post-consent state. Pre-consent compliance
|
||||||
|
checks live in ``consent_validator.py`` and use a separate code path.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from urllib.parse import urlparse
|
from datetime import UTC, datetime
|
||||||
|
from urllib.parse import quote, urlparse
|
||||||
|
|
||||||
from playwright.async_api import (
|
from playwright.async_api import (
|
||||||
BrowserContext,
|
BrowserContext,
|
||||||
@@ -22,6 +34,50 @@ from playwright.async_api import (
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Every ConsentOS consent category. The crawler marks all of them as
# accepted in the pre-seeded consent cookie so that the loader takes its
# "consent already given" branch and unblocks all scripts/cookies.
_ALL_CATEGORIES: list[str] = [
    "necessary",
    "functional",
    "analytics",
    "marketing",
    "personalisation",
]

# Name of the cookie that stores the visitor's consent state. It must
# stay identical to ``COOKIE_NAME`` in apps/banner/src/consent.ts — if
# the name changes there, change it here as well.
_CONSENT_COOKIE_NAME = "_consentos_consent"
|
||||||
|
|
||||||
|
|
||||||
|
def _build_consent_cookie(url: str) -> dict:
    """Build the Playwright cookie dict that pre-seeds ConsentOS consent.

    The cookie value is a compact JSON object, percent-encoded with no
    "safe" characters, holding:

    * ``visitorId`` — a fresh random UUID4 string,
    * ``accepted`` — every entry of ``_ALL_CATEGORIES``,
    * ``rejected`` — an empty list,
    * ``consentedAt`` — the current UTC time in ISO-8601 form,
    * ``bannerVersion`` — the literal ``"scanner"``.

    This is intended to mirror what
    ``apps/banner/src/consent.ts:writeConsent`` writes, so that the
    loader's ``readConsent`` accepts it and goes straight to
    ``updateAcceptedCategories(...)``. Accepting every category is
    deliberate: the scanner reports what a site loads when the visitor
    accepts everything.

    Args:
        url: URL about to be crawled; passed through as the cookie's
            ``url`` field.

    Returns:
        A dict with ``name``, ``value``, ``url``, ``path`` (``"/"``),
        ``expires`` (now + 365 days, Unix seconds) and ``sameSite``
        (``"Lax"``), suitable for ``BrowserContext.add_cookies``.
    """
    # Key order is kept stable so the serialised JSON is predictable.
    consent_state = {
        "visitorId": str(uuid.uuid4()),
        "accepted": _ALL_CATEGORIES,
        "rejected": [],
        "consentedAt": datetime.now(UTC).isoformat(),
        "bannerVersion": "scanner",
    }
    compact_json = json.dumps(consent_state, separators=(",", ":"))
    # safe="" percent-encodes every reserved character, including "/".
    encoded_state = quote(compact_json, safe="")
    one_year_from_now = time.time() + 365 * 86400

    # NOTE(review): Playwright documents ``url`` and ``domain``/``path``
    # as alternative ways to scope a cookie. Supplying both ``url`` and
    # ``path`` may be rejected by a real browser context — the unit tests
    # mock ``add_cookies`` and would not notice. Confirm against a live
    # Playwright context before relying on this shape.
    return {
        "name": _CONSENT_COOKIE_NAME,
        "value": encoded_state,
        "url": url,
        "path": "/",
        "expires": one_year_from_now,
        "sameSite": "Lax",
    }
|
||||||
|
|
||||||
|
|
||||||
# Realistic Chrome UA so sites don't block the crawler as a bot.
|
# Realistic Chrome UA so sites don't block the crawler as a bot.
|
||||||
_DEFAULT_USER_AGENT = (
|
_DEFAULT_USER_AGENT = (
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
@@ -152,8 +208,13 @@ class CookieCrawler:
|
|||||||
user_agent=self._user_agent,
|
user_agent=self._user_agent,
|
||||||
ignore_https_errors=True,
|
ignore_https_errors=True,
|
||||||
)
|
)
|
||||||
# Clear all cookies before visiting
|
# Start from a clean slate, then plant the ConsentOS consent
|
||||||
|
# cookie so the loader treats the visitor as having already
|
||||||
|
# accepted every category. Without this the scan only sees
|
||||||
|
# strictly-necessary cookies — useless for "what does this
|
||||||
|
# site actually load?" reporting.
|
||||||
await context.clear_cookies()
|
await context.clear_cookies()
|
||||||
|
await context.add_cookies([_build_consent_cookie(url)])
|
||||||
|
|
||||||
page: Page = await context.new_page()
|
page: Page = await context.new_page()
|
||||||
|
|
||||||
|
|||||||
@@ -8,10 +8,13 @@ from unittest.mock import AsyncMock, MagicMock, patch
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from src.crawler import (
|
from src.crawler import (
|
||||||
|
_ALL_CATEGORIES,
|
||||||
|
_CONSENT_COOKIE_NAME,
|
||||||
CookieCrawler,
|
CookieCrawler,
|
||||||
CrawlResult,
|
CrawlResult,
|
||||||
DiscoveredCookie,
|
DiscoveredCookie,
|
||||||
SiteCrawlResult,
|
SiteCrawlResult,
|
||||||
|
_build_consent_cookie,
|
||||||
_build_initiator_chain,
|
_build_initiator_chain,
|
||||||
_get_script_initiator,
|
_get_script_initiator,
|
||||||
)
|
)
|
||||||
@@ -438,3 +441,85 @@ class TestCrawlSite:
|
|||||||
await crawler.crawl_site(["https://example.com/"])
|
await crawler.crawl_site(["https://example.com/"])
|
||||||
|
|
||||||
browser.close.assert_awaited_once()
|
browser.close.assert_awaited_once()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Consent pre-seed ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildConsentCookie:
    """Checks on the ``_consentos_consent`` cookie the crawler pre-seeds."""

    def test_cookie_name_matches_loader(self):
        """The cookie is named via ``_CONSENT_COOKIE_NAME``."""
        seeded = _build_consent_cookie("https://example.com/")
        assert seeded["name"] == _CONSENT_COOKIE_NAME == "_consentos_consent"

    def test_cookie_is_url_scoped_for_playwright(self):
        """The cookie carries the crawled URL and a root path."""
        target = "https://example.com/page"
        seeded = _build_consent_cookie(target)
        assert seeded["url"] == "https://example.com/page"
        assert seeded["path"] == "/"

    def test_cookie_value_decodes_to_consent_state_with_all_categories(self):
        """Decoding the value yields a consent state accepting everything."""
        import json as _json
        from urllib.parse import unquote

        raw_value = _build_consent_cookie("https://example.com/")["value"]
        decoded = _json.loads(unquote(raw_value))

        assert sorted(decoded["accepted"]) == sorted(_ALL_CATEGORIES)
        assert decoded["rejected"] == []
        # Fields of ConsentState that the loader's readConsent() relies on.
        for required_key in ("visitorId", "consentedAt", "bannerVersion"):
            assert required_key in decoded

    def test_cookie_expires_far_in_future(self):
        """Expiry is roughly a year out; 300 days leaves ample slack."""
        import time as _time

        seeded = _build_consent_cookie("https://example.com/")
        earliest_acceptable = _time.time() + 300 * 86400
        assert seeded["expires"] > earliest_acceptable

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.crawler.async_playwright")
    async def test_crawl_seeds_consent_before_navigation(self, mock_pw):
        """Cookies are cleared, then seeded, and only then is ``goto`` run."""
        page = _make_mock_page()
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)

        # Record the order in which the crawler touches context and page.
        events: list[str] = []

        def _recorder(label, delegate=None):
            """Return an async side effect that logs *label*, then delegates."""

            async def _side_effect(*args, **kwargs):
                events.append(label)
                if delegate is None:
                    return None
                return await delegate(*args, **kwargs)

            return _side_effect

        # Wrap the existing mocks so their configured behaviour still runs.
        context.add_cookies = AsyncMock(
            side_effect=_recorder("add_cookies", context.add_cookies)
        )
        context.clear_cookies = AsyncMock(
            side_effect=_recorder("clear_cookies", context.clear_cookies)
        )
        page.goto = AsyncMock(side_effect=_recorder("goto"))

        # Make ``async with async_playwright()`` yield our fake instance.
        pw_instance = AsyncMock()
        pw_instance.chromium.launch = AsyncMock(return_value=browser)
        mock_pw.return_value.__aenter__ = AsyncMock(return_value=pw_instance)
        mock_pw.return_value.__aexit__ = AsyncMock(return_value=False)

        await CookieCrawler().crawl_site(["https://example.com/"])

        assert events == ["clear_cookies", "add_cookies", "goto"], events

        # The single seeded cookie is the consent cookie for the crawled URL.
        cookies_passed = context.add_cookies.call_args.args[0]
        assert len(cookies_passed) == 1
        consent_cookie = cookies_passed[0]
        assert consent_cookie["name"] == "_consentos_consent"
        assert consent_cookie["url"] == "https://example.com/"
|
||||||
|
|||||||
Reference in New Issue
Block a user