consentos/apps/scanner/src/crawler.py
James Cottrill e0f1dd43e8 fix(scanner): reliable cookie discovery, auto-categorisation, and scan scheduling UI (#7)
Scanner fixes:
- Remove conflicting ``path`` from consent pre-seed cookie (Playwright
  rejects cookies with both ``url`` and ``path``).
- Switch to ``networkidle`` + 5s + 2s delayed second-pass for reliable
  cookie capture.
- Check sitemap Content-Type to skip SPA HTML fallbacks.
- Propagate ``auto_category`` from scan results to the cookies table
  during sync (was silently dropped).
- Add ``_gcl_ls`` to the Open Cookie Database CSV.
- Classify ``_consentos_*`` cookies as necessary directly in the
  classification engine.
- Add ``seed_known_cookies`` to the bootstrap init container command.
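
The sitemap Content-Type check mentioned above presumably amounts to
something like the following sketch. This is illustrative only —
``fetch_sitemap`` and the ``httpx`` usage are assumptions, not the
repository's actual sitemap code::

    import httpx

    async def fetch_sitemap(url: str) -> str | None:
        # SPA servers often answer unknown paths with 200 + text/html,
        # so only treat the response as a sitemap if it looks like XML.
        async with httpx.AsyncClient(follow_redirects=True, timeout=10) as client:
            resp = await client.get(url)
        content_type = resp.headers.get("content-type", "").lower()
        if resp.status_code != 200 or "xml" not in content_type:
            return None
        return resp.text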

Admin UI:
- Add scan schedule control to the Scans tab — preset options
  (disabled/daily/weekly/fortnightly/monthly) plus custom cron input.
  Saves ``scan_schedule_cron`` on the site config.
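
The presets presumably map onto cron expressions along these lines.
The ``SCHEDULE_PRESETS`` name and the exact expressions are illustrative
assumptions, not the repository's actual code::

    # None disables scheduling; the times here are arbitrary examples.
    SCHEDULE_PRESETS: dict[str, str | None] = {
        "disabled": None,
        "daily": "0 3 * * *",           # every day at 03:00
        "weekly": "0 3 * * 1",          # every Monday at 03:00
        "fortnightly": "0 3 */14 * *",  # days 1, 15, 29 — roughly fortnightly
        "monthly": "0 3 1 * *",         # first day of each month at 03:00
    }

A custom cron string entered in the UI would be stored verbatim in
``scan_schedule_cron``.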
2026-04-18 20:14:32 +01:00


"""Playwright-based headless browser cookie crawler.
For each URL: launches headless Chromium, **pre-seeds an
"all categories accepted" ConsentOS consent cookie**, clears any other
cookies, navigates, waits for network idle, enumerates
``document.cookie`` / ``localStorage`` / ``sessionStorage``, captures
``Set-Cookie`` headers from network requests, and attributes cookies
to source scripts via the request chain.
The pre-seed is what makes the scan useful: without it the loader
would block analytics/marketing scripts and the scan would only see
strictly-necessary cookies, which tells you nothing about what the
site actually loads in the post-consent state. Pre-consent compliance
checks live in ``consent_validator.py`` and use a separate code path.
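
Usage (illustrative sketch; the URL and proxy server are placeholders)::

    import asyncio

    async def main() -> None:
        crawler = CookieCrawler(
            timeout_ms=30_000,
            proxy=ProxyConfig(server="http://proxy-eu.example.com:8080"),
        )
        result = await crawler.crawl_site(["https://example.com/"], max_pages=10)
        for cookie in result.unique_cookies:
            print(cookie.name, cookie.domain, cookie.storage_type)

    asyncio.run(main())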
"""
from __future__ import annotations
import json
import logging
import time
import uuid
from dataclasses import dataclass, field
from datetime import UTC, datetime
from urllib.parse import quote, urlparse

from playwright.async_api import (
    Browser,
    BrowserContext,
    Page,
    Request,
    Response,
    async_playwright,
)

logger = logging.getLogger(__name__)

# All ConsentOS categories — pre-seeded as accepted on every crawl so
# the loader's "consent already given" branch fires and unblocks all
# scripts/cookies.
_ALL_CATEGORIES: list[str] = [
    "necessary",
    "functional",
    "analytics",
    "marketing",
    "personalisation",
]

# Must match ``COOKIE_NAME`` in apps/banner/src/consent.ts. If you
# rename it there, rename it here too.
_CONSENT_COOKIE_NAME = "_consentos_consent"


def _build_consent_cookie(url: str) -> dict:
    """Return a Playwright cookie dict pre-seeding ConsentOS consent.

    Mirrors the shape that ``apps/banner/src/consent.ts:writeConsent``
    produces — URL-encoded JSON of a ``ConsentState`` — so the loader's
    ``readConsent`` returns a valid object and short-circuits straight
    to ``updateAcceptedCategories(...)``. Categories are hard-coded to
    every known ConsentOS category; the scanner is a "what does this
    site load when the visitor accepts everything?" tool, by design.
    """
    state = {
        "visitorId": str(uuid.uuid4()),
        "accepted": _ALL_CATEGORIES,
        "rejected": [],
        "consentedAt": datetime.now(UTC).isoformat(),
        "bannerVersion": "scanner",
    }
    value = quote(json.dumps(state, separators=(",", ":")), safe="")
    # Playwright's ``add_cookies`` accepts EITHER ``url`` (from which
    # it derives domain/path/secure) OR explicit ``domain`` + ``path``
    # — but not both. Using ``url`` is simplest.
    return {
        "name": _CONSENT_COOKIE_NAME,
        "value": value,
        "url": url,
        "expires": time.time() + 365 * 86400,
        "sameSite": "Lax",
    }


# Realistic Chrome UA so sites don't block the crawler as a bot.
_DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)


@dataclass
class DiscoveredCookie:
    """A cookie or storage item found during a crawl."""

    name: str
    domain: str
    storage_type: str = "cookie"  # cookie | local_storage | session_storage
    path: str | None = None
    expires: float | None = None
    http_only: bool | None = None
    secure: bool | None = None
    same_site: str | None = None
    value_length: int = 0
    script_source: str | None = None
    page_url: str = ""
    initiator_chain: list[str] = field(default_factory=list)


@dataclass
class CrawlResult:
    """Result of crawling a single page."""

    url: str
    cookies: list[DiscoveredCookie] = field(default_factory=list)
    error: str | None = None


@dataclass
class SiteCrawlResult:
    """Aggregated result of crawling all pages on a site."""

    domain: str
    pages: list[CrawlResult] = field(default_factory=list)
    total_cookies_found: int = 0

    @property
    def unique_cookies(self) -> list[DiscoveredCookie]:
        """Deduplicate cookies across pages by (name, domain, storage_type)."""
        seen: dict[tuple[str, str, str], DiscoveredCookie] = {}
        for page in self.pages:
            for cookie in page.cookies:
                key = (cookie.name, cookie.domain, cookie.storage_type)
                if key not in seen:
                    seen[key] = cookie
        return list(seen.values())


@dataclass
class ProxyConfig:
    """Proxy configuration for geo-located scanning."""

    server: str  # e.g. "http://proxy-eu.example.com:8080"
    username: str | None = None
    password: str | None = None


class CookieCrawler:
    """Crawls a site using Playwright to discover cookies and storage items."""

    def __init__(
        self,
        *,
        headless: bool = True,
        timeout_ms: int = 30_000,
        user_agent: str = _DEFAULT_USER_AGENT,
        proxy: ProxyConfig | None = None,
    ) -> None:
        self._headless = headless
        self._timeout_ms = timeout_ms
        self._user_agent = user_agent
        self._proxy = proxy

    async def crawl_site(
        self,
        urls: list[str],
        *,
        max_pages: int = 50,
    ) -> SiteCrawlResult:
        """Crawl multiple URLs and aggregate cookie discoveries."""
        if not urls:
            return SiteCrawlResult(domain="")
        domain = urlparse(urls[0]).hostname or ""
        result = SiteCrawlResult(domain=domain)
        async with async_playwright() as pw:
            launch_kwargs: dict = {"headless": self._headless}
            if self._proxy:
                proxy_opts: dict = {"server": self._proxy.server}
                if self._proxy.username:
                    proxy_opts["username"] = self._proxy.username
                if self._proxy.password:
                    proxy_opts["password"] = self._proxy.password
                launch_kwargs["proxy"] = proxy_opts
            browser = await pw.chromium.launch(**launch_kwargs)
            try:
                for url in urls[:max_pages]:
                    page_result = await self._crawl_page(browser, url)
                    result.pages.append(page_result)
                    result.total_cookies_found += len(page_result.cookies)
            finally:
                await browser.close()
        return result

    async def _crawl_page(
        self,
        browser: Browser,
        url: str,
    ) -> CrawlResult:
"""Crawl a single page and discover cookies."""
result = CrawlResult(url=url)
script_cookies: dict[str, str] = {} # cookie name → script URL
initiator_map: dict[str, str] = {} # request URL → initiating URL
initiator_chains: dict[str, list[str]] = {} # cookie name → chain
# Cookies discovered directly from Set-Cookie response headers.
# Keyed by (name, domain) so they can be merged with CDP results.
header_cookies: dict[tuple[str, str], DiscoveredCookie] = {}
context: BrowserContext | None = None
try:
context = await browser.new_context(
user_agent=self._user_agent,
ignore_https_errors=True,
)
# Start from a clean slate, then plant the ConsentOS consent
# cookie so the loader treats the visitor as having already
# accepted every category. Without this the scan only sees
# strictly-necessary cookies — useless for "what does this
# site actually load?" reporting.
await context.clear_cookies()
await context.add_cookies([_build_consent_cookie(url)])
page: Page = await context.new_page()
# Track request initiator chains via frame URL and redirect chains
def _on_request(request: Request) -> None:
try:
req_url = request.url
# Follow redirect chain to find the original initiator
redirected = request.redirected_from
if redirected:
initiator_map[req_url] = redirected.url
else:
# Use the frame URL as the parent initiator
frame_url = request.frame.url if request.frame else ""
if frame_url and frame_url != req_url:
initiator_map[req_url] = frame_url
except Exception:
pass # Non-critical — request introspection may fail
page.on("request", _on_request)
# Track Set-Cookie headers from responses and create
# DiscoveredCookie entries directly — CDP's context.cookies()
# may not enumerate cross-domain cookies.
async def _on_response(response: Response) -> None:
try:
headers = await response.all_headers()
set_cookie = headers.get("set-cookie", "")
if set_cookie:
# Attribute cookie to the initiating script
request: Request = response.request
initiator = _get_script_initiator(request)
# Build the initiator chain for this request
chain = _build_initiator_chain(request.url, initiator_map)
resp_domain = urlparse(response.url).hostname or ""
for cookie_str in set_cookie.split("\n"):
name = cookie_str.split("=")[0].strip()
if name:
if initiator:
script_cookies[name] = initiator
initiator_chains[name] = chain
# Parse optional Domain attribute from
# the Set-Cookie header; fall back to
# the response hostname.
domain = resp_domain
for part in cookie_str.split(";")[1:]:
part = part.strip()
if part.lower().startswith("domain="):
domain = part.split("=", 1)[1].strip()
break
key = (name, domain)
if key not in header_cookies:
header_cookies[key] = DiscoveredCookie(
name=name,
domain=domain,
storage_type="cookie",
script_source=initiator,
page_url=url,
initiator_chain=chain,
)
except Exception:
pass # Non-critical — response may have been aborted
page.on("response", _on_response)
# Navigate — networkidle waits until ≤2 active connections for
# 500ms, which catches the GA beacon round-trip that
# domcontentloaded misses.
await page.goto(url, wait_until="networkidle", timeout=self._timeout_ms)
# Safety margin for late-firing scripts (e.g. deferred GTM tags).
await page.wait_for_timeout(5000)
# First pass — enumerate browser cookies via CDP.
cdp_cookies = await context.cookies()
# Second pass — wait a further 2 seconds for any delayed
# Set-Cookie headers, then merge newly appeared cookies.
await page.wait_for_timeout(2000)
delayed_cookies = await context.cookies()
# Merge: index first-pass cookies by (name, domain), then
# add any that only appeared in the second pass.
seen_keys: set[tuple[str, str]] = set()
all_cdp_cookies: list[dict] = []
for c in cdp_cookies:
key = (c["name"], c["domain"])
seen_keys.add(key)
all_cdp_cookies.append(c)
for c in delayed_cookies:
key = (c["name"], c["domain"])
if key not in seen_keys:
seen_keys.add(key)
all_cdp_cookies.append(c)
for c in all_cdp_cookies:
result.cookies.append(
DiscoveredCookie(
name=c["name"],
domain=c["domain"],
storage_type="cookie",
path=c.get("path"),
expires=c.get("expires"),
http_only=c.get("httpOnly"),
secure=c.get("secure"),
same_site=c.get("sameSite"),
value_length=len(c.get("value", "")),
script_source=script_cookies.get(c["name"]),
page_url=url,
initiator_chain=initiator_chains.get(c["name"], []),
)
)
# Merge cookies seen in Set-Cookie headers but NOT in the
# CDP cookie jar (e.g. cross-domain cookies that the browser
# scoped to a different origin).
for key, hc in header_cookies.items():
if key not in seen_keys:
result.cookies.append(hc)
# Enumerate localStorage
ls_items = await page.evaluate("""() => {
const items = [];
try {
for (let i = 0; i < localStorage.length; i++) {
const key = localStorage.key(i);
if (key) {
items.push({
name: key,
valueLength: (localStorage.getItem(key) || '').length,
});
}
}
} catch (e) {}
return items;
}""")
hostname = urlparse(url).hostname or ""
for item in ls_items:
result.cookies.append(
DiscoveredCookie(
name=item["name"],
domain=hostname,
storage_type="local_storage",
value_length=item["valueLength"],
page_url=url,
)
)
# Enumerate sessionStorage
ss_items = await page.evaluate("""() => {
const items = [];
try {
for (let i = 0; i < sessionStorage.length; i++) {
const key = sessionStorage.key(i);
if (key) {
items.push({
name: key,
valueLength: (sessionStorage.getItem(key) || '').length,
});
}
}
} catch (e) {}
return items;
}""")
for item in ss_items:
result.cookies.append(
DiscoveredCookie(
name=item["name"],
domain=hostname,
storage_type="session_storage",
value_length=item["valueLength"],
page_url=url,
)
)
except Exception as exc:
result.error = str(exc)
logger.warning("Failed to crawl %s: %s", url, exc)
finally:
if context:
await context.close()
return result


def _get_script_initiator(request: Request) -> str | None:
    """Walk the request chain to find the originating script URL.

    Returns a single script URL for backwards compatibility. For the full
    initiator path, use :func:`_build_initiator_chain` instead.
    """
    seen: set[str] = set()
    current = request
    while current:
        url = current.url
        if url in seen:
            break
        seen.add(url)
        # Playwright reports script requests with resource_type == "script".
        if url.endswith(".js") or current.resource_type == "script":
            return url
        redirected = current.redirected_from
        if redirected:
            current = redirected
        else:
            break
    return None


def _build_initiator_chain(
    url: str,
    initiator_map: dict[str, str],
    max_depth: int = 20,
) -> list[str]:
    """Build the full initiator chain from a URL back to the root.

    Walks the initiator map from *url* towards the top-level page,
    producing a list ordered root-first (i.e. the page URL at index 0
    and the leaf request URL at the end).
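
    Example (the URLs are illustrative placeholders)::

        >>> _build_initiator_chain(
        ...     "https://tracker.example/pixel.gif",
        ...     {
        ...         "https://tracker.example/pixel.gif": "https://cdn.example/tag.js",
        ...         "https://cdn.example/tag.js": "https://www.example.com/",
        ...     },
        ... )
        ['https://www.example.com/', 'https://cdn.example/tag.js', 'https://tracker.example/pixel.gif']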
"""
chain = [url]
seen: set[str] = {url}
current = url
for _ in range(max_depth):
parent = initiator_map.get(current, "")
if not parent or parent in seen:
break
chain.append(parent)
seen.add(parent)
current = parent
chain.reverse() # Root first
return chain