Scanner fixes:
- Remove conflicting ``path`` from the consent pre-seed cookie (Playwright rejects cookies with both ``url`` and ``path``).
- Switch navigation to ``networkidle`` plus a 5s settle wait and a 2s delayed second pass for reliable cookie capture.
- Check the sitemap Content-Type to skip SPA HTML fallbacks.
- Propagate ``auto_category`` from scan results to the cookies table during sync (was silently dropped).
- Add ``_gcl_ls`` to the Open Cookie Database CSV.
- Classify ``_consentos_*`` cookies as necessary directly in the classification engine.
- Add ``seed_known_cookies`` to the bootstrap init container command.

Admin UI:
- Add a scan schedule control to the Scans tab — preset options (disabled/daily/weekly/fortnightly/monthly) plus a custom cron input. Saves ``scan_schedule_cron`` on the site config.

"""Playwright-based headless browser cookie crawler.
|
|
|
|
For each URL: launches headless Chromium, **pre-seeds an
|
|
"all categories accepted" ConsentOS consent cookie**, clears any other
|
|
cookies, navigates, waits for network idle, enumerates
|
|
``document.cookie`` / ``localStorage`` / ``sessionStorage``, captures
|
|
``Set-Cookie`` headers from network requests, and attributes cookies
|
|
to source scripts via the request chain.
|
|
|
|
The pre-seed is what makes the scan useful: without it the loader
|
|
would block analytics/marketing scripts and the scan would only see
|
|
strictly-necessary cookies, which tells you nothing about what the
|
|
site actually loads in the post-consent state. Pre-consent compliance
|
|
checks live in ``consent_validator.py`` and use a separate code path.
|
|
"""

from __future__ import annotations

import json
import logging
import time
import uuid
from dataclasses import dataclass, field
from datetime import UTC, datetime
from urllib.parse import quote, urlparse

from playwright.async_api import (
    Browser,
    BrowserContext,
    Page,
    Request,
    Response,
    async_playwright,
)

logger = logging.getLogger(__name__)

# All ConsentOS categories — pre-seeded as accepted on every crawl so
# the loader's "consent already given" branch fires and unblocks all
# scripts/cookies.
_ALL_CATEGORIES: list[str] = [
    "necessary",
    "functional",
    "analytics",
    "marketing",
    "personalisation",
]

# Must match ``COOKIE_NAME`` in apps/banner/src/consent.ts. If you
# rename it there, rename it here too.
_CONSENT_COOKIE_NAME = "_consentos_consent"


def _build_consent_cookie(url: str) -> dict:
    """Return a Playwright cookie dict pre-seeding ConsentOS consent.

    Mirrors the shape that ``apps/banner/src/consent.ts:writeConsent``
    produces — URL-encoded JSON of a ``ConsentState`` — so the loader's
    ``readConsent`` returns a valid object and short-circuits straight
    to ``updateAcceptedCategories(...)``. Categories are hard-coded to
    every known ConsentOS category; the scanner is a "what does this
    site load when the visitor accepts everything?" tool, by design.
    """
    state = {
        "visitorId": str(uuid.uuid4()),
        "accepted": _ALL_CATEGORIES,
        "rejected": [],
        "consentedAt": datetime.now(UTC).isoformat(),
        "bannerVersion": "scanner",
    }
    value = quote(json.dumps(state, separators=(",", ":")), safe="")
    # Playwright's ``add_cookies`` accepts EITHER ``url`` (from which
    # it derives domain/path/secure) OR explicit ``domain`` + ``path``
    # — but not both. Using ``url`` is simplest.
    return {
        "name": _CONSENT_COOKIE_NAME,
        "value": value,
        "url": url,
        "expires": time.time() + 365 * 86400,
        "sameSite": "Lax",
    }
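
# Illustrative decoded value of the pre-seeded cookie (the visitorId and
# timestamp below are hypothetical), for comparison with ``ConsentState``
# in apps/banner/src/consent.ts:
#   {"visitorId": "8d6f...", "accepted": ["necessary", "functional",
#    "analytics", "marketing", "personalisation"], "rejected": [],
#    "consentedAt": "2025-01-01T12:00:00+00:00", "bannerVersion": "scanner"}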


# Realistic Chrome UA so sites don't block the crawler as a bot.
_DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)


@dataclass
class DiscoveredCookie:
    """A cookie or storage item found during a crawl."""

    name: str
    domain: str
    storage_type: str = "cookie"  # cookie | local_storage | session_storage
    path: str | None = None
    expires: float | None = None
    http_only: bool | None = None
    secure: bool | None = None
    same_site: str | None = None
    value_length: int = 0
    script_source: str | None = None
    page_url: str = ""
    initiator_chain: list[str] = field(default_factory=list)


@dataclass
class CrawlResult:
    """Result of crawling a single page."""

    url: str
    cookies: list[DiscoveredCookie] = field(default_factory=list)
    error: str | None = None


@dataclass
class SiteCrawlResult:
    """Aggregated result of crawling all pages on a site."""

    domain: str
    pages: list[CrawlResult] = field(default_factory=list)
    total_cookies_found: int = 0

    @property
    def unique_cookies(self) -> list[DiscoveredCookie]:
        """Deduplicate cookies across pages by (name, domain, storage_type)."""
        seen: dict[tuple[str, str, str], DiscoveredCookie] = {}
        for page in self.pages:
            for cookie in page.cookies:
                key = (cookie.name, cookie.domain, cookie.storage_type)
                if key not in seen:
                    seen[key] = cookie
        return list(seen.values())


@dataclass
class ProxyConfig:
    """Proxy configuration for geo-located scanning."""

    server: str  # e.g. "http://proxy-eu.example.com:8080"
    username: str | None = None
    password: str | None = None
class CookieCrawler:
    """Crawls a site using Playwright to discover cookies and storage items."""

    def __init__(
        self,
        *,
        headless: bool = True,
        timeout_ms: int = 30_000,
        user_agent: str = _DEFAULT_USER_AGENT,
        proxy: ProxyConfig | None = None,
    ) -> None:
        self._headless = headless
        self._timeout_ms = timeout_ms
        self._user_agent = user_agent
        self._proxy = proxy

    async def crawl_site(
        self,
        urls: list[str],
        *,
        max_pages: int = 50,
    ) -> SiteCrawlResult:
        """Crawl multiple URLs and aggregate cookie discoveries."""
        if not urls:
            return SiteCrawlResult(domain="")

        domain = urlparse(urls[0]).hostname or ""
        result = SiteCrawlResult(domain=domain)

        async with async_playwright() as pw:
            launch_kwargs: dict = {"headless": self._headless}
            if self._proxy:
                proxy_opts: dict = {"server": self._proxy.server}
                if self._proxy.username:
                    proxy_opts["username"] = self._proxy.username
                if self._proxy.password:
                    proxy_opts["password"] = self._proxy.password
                launch_kwargs["proxy"] = proxy_opts
            browser = await pw.chromium.launch(**launch_kwargs)
            try:
                for url in urls[:max_pages]:
                    page_result = await self._crawl_page(browser, url)
                    result.pages.append(page_result)
                    result.total_cookies_found += len(page_result.cookies)
            finally:
                await browser.close()

        return result

    async def _crawl_page(
        self,
        browser: Browser,
        url: str,
    ) -> CrawlResult:
        """Crawl a single page and discover cookies."""
        result = CrawlResult(url=url)
        script_cookies: dict[str, str] = {}  # cookie name → script URL
        initiator_map: dict[str, str] = {}  # request URL → initiating URL
        initiator_chains: dict[str, list[str]] = {}  # cookie name → chain
        # Cookies discovered directly from Set-Cookie response headers.
        # Keyed by (name, domain) so they can be merged with CDP results.
        header_cookies: dict[tuple[str, str], DiscoveredCookie] = {}

        context: BrowserContext | None = None
        try:
            context = await browser.new_context(
                user_agent=self._user_agent,
                ignore_https_errors=True,
            )
            # Start from a clean slate, then plant the ConsentOS consent
            # cookie so the loader treats the visitor as having already
            # accepted every category. Without this the scan only sees
            # strictly-necessary cookies — useless for "what does this
            # site actually load?" reporting.
            await context.clear_cookies()
            await context.add_cookies([_build_consent_cookie(url)])

            page: Page = await context.new_page()

            # Track request initiator chains via frame URL and redirect chains
            def _on_request(request: Request) -> None:
                try:
                    req_url = request.url
                    # Follow redirect chain to find the original initiator
                    redirected = request.redirected_from
                    if redirected:
                        initiator_map[req_url] = redirected.url
                    else:
                        # Use the frame URL as the parent initiator
                        frame_url = request.frame.url if request.frame else ""
                        if frame_url and frame_url != req_url:
                            initiator_map[req_url] = frame_url
                except Exception:
                    pass  # Non-critical — request introspection may fail

            page.on("request", _on_request)

            # Track Set-Cookie headers from responses and create
            # DiscoveredCookie entries directly — CDP's context.cookies()
            # may not enumerate cross-domain cookies.
            async def _on_response(response: Response) -> None:
                try:
                    headers = await response.all_headers()
                    set_cookie = headers.get("set-cookie", "")
                    if set_cookie:
                        # Attribute cookie to the initiating script
                        request: Request = response.request
                        initiator = _get_script_initiator(request)
                        # Build the initiator chain for this request
                        chain = _build_initiator_chain(request.url, initiator_map)
                        resp_domain = urlparse(response.url).hostname or ""
                        for cookie_str in set_cookie.split("\n"):
                            name = cookie_str.split("=")[0].strip()
                            if name:
                                if initiator:
                                    script_cookies[name] = initiator
                                    initiator_chains[name] = chain
                                # Parse optional Domain attribute from
                                # the Set-Cookie header; fall back to
                                # the response hostname.
                                domain = resp_domain
                                for part in cookie_str.split(";")[1:]:
                                    part = part.strip()
                                    if part.lower().startswith("domain="):
                                        domain = part.split("=", 1)[1].strip()
                                        break
                                key = (name, domain)
                                if key not in header_cookies:
                                    header_cookies[key] = DiscoveredCookie(
                                        name=name,
                                        domain=domain,
                                        storage_type="cookie",
                                        script_source=initiator,
                                        page_url=url,
                                        initiator_chain=chain,
                                    )
                except Exception:
                    pass  # Non-critical — response may have been aborted

            page.on("response", _on_response)

            # Navigate — networkidle fires once there have been no network
            # connections for 500ms, which catches the GA beacon round-trip
            # that domcontentloaded misses.
            await page.goto(url, wait_until="networkidle", timeout=self._timeout_ms)
            # Safety margin for late-firing scripts (e.g. deferred GTM tags).
            await page.wait_for_timeout(5000)

            # First pass — enumerate browser cookies via CDP.
            cdp_cookies = await context.cookies()

            # Second pass — wait a further 2 seconds for any delayed
            # Set-Cookie headers, then merge newly appeared cookies.
            await page.wait_for_timeout(2000)
            delayed_cookies = await context.cookies()

            # Merge: index first-pass cookies by (name, domain), then
            # add any that only appeared in the second pass.
            seen_keys: set[tuple[str, str]] = set()
            all_cdp_cookies: list[dict] = []
            for c in cdp_cookies:
                key = (c["name"], c["domain"])
                seen_keys.add(key)
                all_cdp_cookies.append(c)
            for c in delayed_cookies:
                key = (c["name"], c["domain"])
                if key not in seen_keys:
                    seen_keys.add(key)
                    all_cdp_cookies.append(c)

            for c in all_cdp_cookies:
                result.cookies.append(
                    DiscoveredCookie(
                        name=c["name"],
                        domain=c["domain"],
                        storage_type="cookie",
                        path=c.get("path"),
                        expires=c.get("expires"),
                        http_only=c.get("httpOnly"),
                        secure=c.get("secure"),
                        same_site=c.get("sameSite"),
                        value_length=len(c.get("value", "")),
                        script_source=script_cookies.get(c["name"]),
                        page_url=url,
                        initiator_chain=initiator_chains.get(c["name"], []),
                    )
                )

            # Merge cookies seen in Set-Cookie headers but NOT in the
            # CDP cookie jar (e.g. cross-domain cookies that the browser
            # scoped to a different origin).
            for key, hc in header_cookies.items():
                if key not in seen_keys:
                    result.cookies.append(hc)

            # Enumerate localStorage
            ls_items = await page.evaluate("""() => {
                const items = [];
                try {
                    for (let i = 0; i < localStorage.length; i++) {
                        const key = localStorage.key(i);
                        if (key) {
                            items.push({
                                name: key,
                                valueLength: (localStorage.getItem(key) || '').length,
                            });
                        }
                    }
                } catch (e) {}
                return items;
            }""")
            hostname = urlparse(url).hostname or ""
            for item in ls_items:
                result.cookies.append(
                    DiscoveredCookie(
                        name=item["name"],
                        domain=hostname,
                        storage_type="local_storage",
                        value_length=item["valueLength"],
                        page_url=url,
                    )
                )

            # Enumerate sessionStorage
            ss_items = await page.evaluate("""() => {
                const items = [];
                try {
                    for (let i = 0; i < sessionStorage.length; i++) {
                        const key = sessionStorage.key(i);
                        if (key) {
                            items.push({
                                name: key,
                                valueLength: (sessionStorage.getItem(key) || '').length,
                            });
                        }
                    }
                } catch (e) {}
                return items;
            }""")
            for item in ss_items:
                result.cookies.append(
                    DiscoveredCookie(
                        name=item["name"],
                        domain=hostname,
                        storage_type="session_storage",
                        value_length=item["valueLength"],
                        page_url=url,
                    )
                )

        except Exception as exc:
            result.error = str(exc)
            logger.warning("Failed to crawl %s: %s", url, exc)
        finally:
            if context:
                await context.close()

        return result


def _get_script_initiator(request: Request) -> str | None:
    """Walk the request chain to find the originating script URL.

    Returns a single script URL for backwards compatibility. For the full
    initiator path, use :func:`_build_initiator_chain` instead.
    """
    seen: set[str] = set()
    current = request
    while current:
        url = current.url
        if url in seen:
            break
        seen.add(url)
        if url.endswith(".js") or "javascript" in (current.resource_type or ""):
            return url
        redirected = current.redirected_from
        if redirected:
            current = redirected
        else:
            break
    return None


def _build_initiator_chain(
    url: str,
    initiator_map: dict[str, str],
    max_depth: int = 20,
) -> list[str]:
    """Build the full initiator chain from a URL back to the root.

    Walks the initiator map from *url* towards the top-level page,
    producing a list ordered root-first (i.e. the page URL at index 0
    and the leaf request URL at the end).
    """
    chain = [url]
    seen: set[str] = {url}
    current = url
    for _ in range(max_depth):
        parent = initiator_map.get(current, "")
        if not parent or parent in seen:
            break
        chain.append(parent)
        seen.add(parent)
        current = parent
    chain.reverse()  # Root first
    return chain
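

# Minimal usage sketch behind a __main__ guard: crawl a couple of placeholder
# URLs and print the deduplicated findings. The URLs are illustrative only;
# this module is intended to be imported by the scanner, not run directly.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        crawler = CookieCrawler(headless=True, timeout_ms=30_000)
        site = await crawler.crawl_site(
            ["https://www.example.com/", "https://www.example.com/pricing"],
            max_pages=2,
        )
        for cookie in site.unique_cookies:
            print(cookie.storage_type, cookie.name, cookie.domain, cookie.script_source)

    asyncio.run(_demo())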