fix(scanner): reliable cookie discovery, auto-categorisation, and scan scheduling UI (#7)
Scanner fixes:

- Remove the conflicting ``path`` from the consent pre-seed cookie (Playwright rejects cookies with both ``url`` and ``path``).
- Switch to ``networkidle`` + a 5 s wait + a 2 s delayed second pass for reliable cookie capture.
- Check the sitemap Content-Type to skip SPA HTML fallbacks.
- Propagate ``auto_category`` from scan results to the cookies table during sync (it was previously silently dropped).
- Add ``_gcl_ls`` to the Open Cookie Database CSV.
- Classify ``_consentos_*`` cookies as necessary directly in the classification engine.
- Add ``seed_known_cookies`` to the bootstrap init container command.

Admin UI:

- Add a scan schedule control to the Scans tab — preset options (disabled/daily/weekly/fortnightly/monthly) plus a custom cron input. Saves ``scan_schedule_cron`` on the site config; a sketch of one possible preset-to-cron mapping follows below.
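Illustrative sketch (not part of this change set): one way the schedule presets could map onto ``scan_schedule_cron``. The preset names and the config field come from the summary above; the concrete cron expressions and the ``resolve_schedule`` helper are assumptions for illustration only.

# Hypothetical mapping from UI presets to cron expressions; the expressions
# actually used by the admin UI may differ.
PRESET_CRON = {
    "disabled": None,               # no scheduled scans
    "daily": "0 3 * * *",           # every day at 03:00
    "weekly": "0 3 * * 1",          # Mondays at 03:00
    "fortnightly": "0 3 1,15 * *",  # approximation: 1st and 15th of the month
    "monthly": "0 3 1 * *",         # 1st of the month at 03:00
}

def resolve_schedule(preset: str, custom_cron: str | None = None) -> str | None:
    """Return the cron string to store as ``scan_schedule_cron``."""
    if custom_cron:                 # a custom cron input overrides any preset
        return custom_cron
    return PRESET_CRON.get(preset)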
@@ -68,11 +68,13 @@ def _build_consent_cookie(url: str) -> dict:
         "bannerVersion": "scanner",
     }
     value = quote(json.dumps(state, separators=(",", ":")), safe="")
+    # Playwright's ``add_cookies`` accepts EITHER ``url`` (from which
+    # it derives domain/path/secure) OR explicit ``domain`` + ``path``
+    # — but not both. Using ``url`` is simplest.
     return {
         "name": _CONSENT_COOKIE_NAME,
         "value": value,
         "url": url,
-        "path": "/",
         "expires": time.time() + 365 * 86400,
         "sameSite": "Lax",
     }

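For reference, a minimal usage sketch of pre-seeding the consent cookie with Playwright's ``add_cookies``. This is illustrative only: the browser/context flow shown here is assumed, and only ``_build_consent_cookie`` comes from the diff above.

# Illustrative: seed the consent cookie on a fresh context before navigating,
# letting Playwright derive domain/path/secure from the cookie's ``url`` field.
context = await browser.new_context()
await context.add_cookies([_build_consent_cookie("https://example.com/")])
page = await context.new_page()
await page.goto("https://example.com/", wait_until="networkidle")
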
@@ -201,6 +203,9 @@ class CookieCrawler:
         script_cookies: dict[str, str] = {}  # cookie name → script URL
         initiator_map: dict[str, str] = {}  # request URL → initiating URL
         initiator_chains: dict[str, list[str]] = {}  # cookie name → chain
+        # Cookies discovered directly from Set-Cookie response headers.
+        # Keyed by (name, domain) so they can be merged with CDP results.
+        header_cookies: dict[tuple[str, str], DiscoveredCookie] = {}

         context: BrowserContext | None = None
         try:

@@ -236,7 +241,9 @@ class CookieCrawler:

         page.on("request", _on_request)

-        # Track Set-Cookie headers from responses
+        # Track Set-Cookie headers from responses and create
+        # DiscoveredCookie entries directly — CDP's context.cookies()
+        # may not enumerate cross-domain cookies.
         async def _on_response(response: Response) -> None:
             try:
                 headers = await response.all_headers()

@@ -247,25 +254,67 @@ class CookieCrawler:
                 initiator = _get_script_initiator(request)
                 # Build the initiator chain for this request
                 chain = _build_initiator_chain(request.url, initiator_map)
+                resp_domain = urlparse(response.url).hostname or ""
                 for cookie_str in set_cookie.split("\n"):
                     name = cookie_str.split("=")[0].strip()
                     if name:
                         if initiator:
                             script_cookies[name] = initiator
                             initiator_chains[name] = chain
+                        # Parse optional Domain attribute from
+                        # the Set-Cookie header; fall back to
+                        # the response hostname.
+                        domain = resp_domain
+                        for part in cookie_str.split(";")[1:]:
+                            part = part.strip()
+                            if part.lower().startswith("domain="):
+                                domain = part.split("=", 1)[1].strip()
+                                break
+                        key = (name, domain)
+                        if key not in header_cookies:
+                            header_cookies[key] = DiscoveredCookie(
+                                name=name,
+                                domain=domain,
+                                storage_type="cookie",
+                                script_source=initiator,
+                                page_url=url,
+                                initiator_chain=chain,
+                            )
             except Exception:
                 pass  # Non-critical — response may have been aborted

         page.on("response", _on_response)

-        # Navigate
-        await page.goto(url, wait_until="domcontentloaded", timeout=self._timeout_ms)
-        # Allow additional time for scripts to set cookies after DOM load.
-        await page.wait_for_timeout(3000)
+        # Navigate — networkidle waits until ≤2 active connections for
+        # 500ms, which catches the GA beacon round-trip that
+        # domcontentloaded misses.
+        await page.goto(url, wait_until="networkidle", timeout=self._timeout_ms)
+        # Safety margin for late-firing scripts (e.g. deferred GTM tags).
+        await page.wait_for_timeout(5000)

-        # Enumerate browser cookies via CDP
+        # First pass — enumerate browser cookies via CDP.
         cdp_cookies = await context.cookies()
+
+        # Second pass — wait a further 2 seconds for any delayed
+        # Set-Cookie headers, then merge newly appeared cookies.
+        await page.wait_for_timeout(2000)
+        delayed_cookies = await context.cookies()
+
+        # Merge: index first-pass cookies by (name, domain), then
+        # add any that only appeared in the second pass.
+        seen_keys: set[tuple[str, str]] = set()
+        all_cdp_cookies: list[dict] = []
         for c in cdp_cookies:
+            key = (c["name"], c["domain"])
+            seen_keys.add(key)
+            all_cdp_cookies.append(c)
+        for c in delayed_cookies:
+            key = (c["name"], c["domain"])
+            if key not in seen_keys:
+                seen_keys.add(key)
+                all_cdp_cookies.append(c)
+
+        for c in all_cdp_cookies:
             result.cookies.append(
                 DiscoveredCookie(
                     name=c["name"],

@@ -283,6 +332,13 @@ class CookieCrawler:
                 )
             )

+        # Merge cookies seen in Set-Cookie headers but NOT in the
+        # CDP cookie jar (e.g. cross-domain cookies that the browser
+        # scoped to a different origin).
+        for key, hc in header_cookies.items():
+            if key not in seen_keys:
+                result.cookies.append(hc)
+
         # Enumerate localStorage
         ls_items = await page.evaluate("""() => {
             const items = [];

@@ -75,6 +75,13 @@ async def _fetch_sitemap(
     if resp.status_code != 200:
         return []

+    # SPAs with catch-all nginx/Caddy rules return 200 + text/html
+    # for /sitemap.xml. Don't try to parse HTML as XML.
+    content_type = resp.headers.get("content-type", "")
+    if "html" in content_type and "xml" not in content_type:
+        logger.debug("Sitemap %s returned HTML, skipping", url)
+        return []
+
     root = ElementTree.fromstring(resp.text)

     # Check if it's a sitemap index

@@ -42,11 +42,35 @@ def _make_mock_page(
     return page


-def _make_mock_context(page, cookies: list[dict] | None = None):
-    """Build a mock BrowserContext."""
+def _make_mock_context(
+    page,
+    cookies: list[dict] | None = None,
+    delayed_cookies: list[dict] | None = None,
+):
+    """Build a mock BrowserContext.
+
+    *cookies* is returned on the first ``context.cookies()`` call (the
+    initial CDP enumeration). *delayed_cookies* is returned on the
+    second call (the delayed pass); defaults to the same list so
+    existing tests need no changes.
+    """
     context = AsyncMock()
     context.new_page = AsyncMock(return_value=page)
-    context.cookies = AsyncMock(return_value=cookies or [])
+    first = cookies or []
+    second = delayed_cookies if delayed_cookies is not None else first
+    # The crawler calls context.cookies() twice per page (initial +
+    # delayed pass). Using a cycling function instead of a fixed-length
+    # side_effect list so multi-page tests don't exhaust the mock.
+    _cycle = [first, second]
+    _call_count = 0
+
+    async def _cycling_cookies(*_args, **_kwargs):
+        nonlocal _call_count
+        result = _cycle[_call_count % len(_cycle)]
+        _call_count += 1
+        return result
+
+    context.cookies = AsyncMock(side_effect=_cycling_cookies)
     context.clear_cookies = AsyncMock()
     context.close = AsyncMock()
     return context

@@ -373,6 +397,44 @@ class TestCrawlPage:
         call_kwargs = browser.new_context.call_args[1]
         assert call_kwargs["user_agent"] == "CMPBot/1.0"

+    @pytest.mark.asyncio(loop_scope="session")
+    async def test_two_pass_cookie_collection_merges_delayed(self):
+        """Cookies appearing only in the second CDP pass are still discovered."""
+        first_pass = [
+            {"name": "_ga", "domain": ".example.com", "value": "GA1.2.12345"},
+        ]
+        second_pass = [
+            {"name": "_ga", "domain": ".example.com", "value": "GA1.2.12345"},
+            {"name": "_gid", "domain": ".example.com", "value": "GID.99"},
+        ]
+
+        page = _make_mock_page()
+        context = _make_mock_context(page, cookies=first_pass, delayed_cookies=second_pass)
+        browser = _make_mock_browser(context)
+
+        crawler = CookieCrawler()
+        result = await crawler._crawl_page(browser, "https://example.com/")
+
+        cookie_names = [c.name for c in result.cookies if c.storage_type == "cookie"]
+        assert "_ga" in cookie_names
+        assert "_gid" in cookie_names
+        # _ga must not be duplicated
+        assert cookie_names.count("_ga") == 1
+
+    @pytest.mark.asyncio(loop_scope="session")
+    async def test_uses_networkidle_wait(self):
+        """page.goto must use wait_until='networkidle'."""
+        page = _make_mock_page()
+        context = _make_mock_context(page)
+        browser = _make_mock_browser(context)
+
+        crawler = CookieCrawler()
+        await crawler._crawl_page(browser, "https://example.com/")
+
+        page.goto.assert_awaited_once()
+        call_kwargs = page.goto.call_args[1]
+        assert call_kwargs.get("wait_until") == "networkidle"
+

 # ── CookieCrawler.crawl_site ───────────────────────────────────────────

@@ -457,7 +519,9 @@ class TestBuildConsentCookie:
         """``url`` lets Playwright derive domain / path / secure."""
        cookie = _build_consent_cookie("https://example.com/page")
        assert cookie["url"] == "https://example.com/page"
-        assert cookie["path"] == "/"
+        # ``path`` is NOT set explicitly — Playwright derives it from ``url``.
+        # Setting both would cause ``add_cookies`` to reject the cookie.
+        assert "path" not in cookie

     def test_cookie_value_decodes_to_consent_state_with_all_categories(self):
         import json as _json