fix(scanner): reliable cookie discovery, auto-categorisation, and scan scheduling UI (#7)
Scanner fixes:

- Remove the conflicting ``path`` from the consent pre-seed cookie (Playwright rejects cookies with both ``url`` and ``path``).
- Switch to ``networkidle`` + a 5 s wait + a 2 s delayed second pass for reliable cookie capture.
- Check the sitemap Content-Type to skip SPA HTML fallbacks.
- Propagate ``auto_category`` from scan results to the cookies table during sync (it was previously silently dropped).
- Add ``_gcl_ls`` to the Open Cookie Database CSV.
- Classify ``_consentos_*`` cookies as necessary directly in the classification engine.
- Add ``seed_known_cookies`` to the bootstrap init container command.

Admin UI:

- Add a scan schedule control to the Scans tab — preset options (disabled/daily/weekly/fortnightly/monthly) plus a custom cron input. Saves ``scan_schedule_cron`` on the site config; a sketch of one possible preset-to-cron mapping follows below.
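Illustrative sketch (not part of this change set): one way the schedule presets could map onto ``scan_schedule_cron``. The preset names and the config field come from the summary above; the concrete cron expressions and the ``resolve_schedule`` helper are assumptions for illustration only.

# Hypothetical mapping from UI presets to cron expressions; the expressions
# actually used by the admin UI may differ.
PRESET_CRON = {
    "disabled": None,               # no scheduled scans
    "daily": "0 3 * * *",           # every day at 03:00
    "weekly": "0 3 * * 1",          # Mondays at 03:00
    "fortnightly": "0 3 1,15 * *",  # approximation: 1st and 15th of the month
    "monthly": "0 3 1 * *",         # 1st of the month at 03:00
}

def resolve_schedule(preset: str, custom_cron: str | None = None) -> str | None:
    """Return the cron string to store as ``scan_schedule_cron``."""
    if custom_cron:                 # a custom cron input overrides any preset
        return custom_cron
    return PRESET_CRON.get(preset)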
@@ -68,11 +68,13 @@ def _build_consent_cookie(url: str) -> dict:
         "bannerVersion": "scanner",
     }
     value = quote(json.dumps(state, separators=(",", ":")), safe="")
+    # Playwright's ``add_cookies`` accepts EITHER ``url`` (from which
+    # it derives domain/path/secure) OR explicit ``domain`` + ``path``
+    # — but not both. Using ``url`` is simplest.
     return {
         "name": _CONSENT_COOKIE_NAME,
         "value": value,
         "url": url,
-        "path": "/",
         "expires": time.time() + 365 * 86400,
         "sameSite": "Lax",
     }

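For reference, a minimal usage sketch of pre-seeding the consent cookie with Playwright's ``add_cookies``. This is illustrative only: the browser/context flow shown here is assumed, and only ``_build_consent_cookie`` comes from the diff above.

# Illustrative: seed the consent cookie on a fresh context before navigating,
# letting Playwright derive domain/path/secure from the cookie's ``url`` field.
context = await browser.new_context()
await context.add_cookies([_build_consent_cookie("https://example.com/")])
page = await context.new_page()
await page.goto("https://example.com/", wait_until="networkidle")
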
@@ -201,6 +203,9 @@ class CookieCrawler:
         script_cookies: dict[str, str] = {}  # cookie name → script URL
         initiator_map: dict[str, str] = {}  # request URL → initiating URL
         initiator_chains: dict[str, list[str]] = {}  # cookie name → chain
+        # Cookies discovered directly from Set-Cookie response headers.
+        # Keyed by (name, domain) so they can be merged with CDP results.
+        header_cookies: dict[tuple[str, str], DiscoveredCookie] = {}

         context: BrowserContext | None = None
         try:

@@ -236,7 +241,9 @@ class CookieCrawler:

         page.on("request", _on_request)

-        # Track Set-Cookie headers from responses
+        # Track Set-Cookie headers from responses and create
+        # DiscoveredCookie entries directly — CDP's context.cookies()
+        # may not enumerate cross-domain cookies.
         async def _on_response(response: Response) -> None:
             try:
                 headers = await response.all_headers()

@@ -247,25 +254,67 @@ class CookieCrawler:
                 initiator = _get_script_initiator(request)
                 # Build the initiator chain for this request
                 chain = _build_initiator_chain(request.url, initiator_map)
+                resp_domain = urlparse(response.url).hostname or ""
                 for cookie_str in set_cookie.split("\n"):
                     name = cookie_str.split("=")[0].strip()
                     if name:
                         if initiator:
                             script_cookies[name] = initiator
                             initiator_chains[name] = chain
+                        # Parse optional Domain attribute from
+                        # the Set-Cookie header; fall back to
+                        # the response hostname.
+                        domain = resp_domain
+                        for part in cookie_str.split(";")[1:]:
+                            part = part.strip()
+                            if part.lower().startswith("domain="):
+                                domain = part.split("=", 1)[1].strip()
+                                break
+                        key = (name, domain)
+                        if key not in header_cookies:
+                            header_cookies[key] = DiscoveredCookie(
+                                name=name,
+                                domain=domain,
+                                storage_type="cookie",
+                                script_source=initiator,
+                                page_url=url,
+                                initiator_chain=chain,
+                            )
             except Exception:
                 pass  # Non-critical — response may have been aborted

         page.on("response", _on_response)

-        # Navigate
-        await page.goto(url, wait_until="domcontentloaded", timeout=self._timeout_ms)
-        # Allow additional time for scripts to set cookies after DOM load.
-        await page.wait_for_timeout(3000)
+        # Navigate — networkidle waits until ≤2 active connections for
+        # 500ms, which catches the GA beacon round-trip that
+        # domcontentloaded misses.
+        await page.goto(url, wait_until="networkidle", timeout=self._timeout_ms)
+        # Safety margin for late-firing scripts (e.g. deferred GTM tags).
+        await page.wait_for_timeout(5000)

-        # Enumerate browser cookies via CDP
+        # First pass — enumerate browser cookies via CDP.
         cdp_cookies = await context.cookies()
+
+        # Second pass — wait a further 2 seconds for any delayed
+        # Set-Cookie headers, then merge newly appeared cookies.
+        await page.wait_for_timeout(2000)
+        delayed_cookies = await context.cookies()
+
+        # Merge: index first-pass cookies by (name, domain), then
+        # add any that only appeared in the second pass.
+        seen_keys: set[tuple[str, str]] = set()
+        all_cdp_cookies: list[dict] = []
         for c in cdp_cookies:
+            key = (c["name"], c["domain"])
+            seen_keys.add(key)
+            all_cdp_cookies.append(c)
+        for c in delayed_cookies:
+            key = (c["name"], c["domain"])
+            if key not in seen_keys:
+                seen_keys.add(key)
+                all_cdp_cookies.append(c)
+
+        for c in all_cdp_cookies:
             result.cookies.append(
                 DiscoveredCookie(
                     name=c["name"],

@@ -283,6 +332,13 @@ class CookieCrawler:
                 )
             )

+        # Merge cookies seen in Set-Cookie headers but NOT in the
+        # CDP cookie jar (e.g. cross-domain cookies that the browser
+        # scoped to a different origin).
+        for key, hc in header_cookies.items():
+            if key not in seen_keys:
+                result.cookies.append(hc)
+
         # Enumerate localStorage
         ls_items = await page.evaluate("""() => {
             const items = [];

@@ -75,6 +75,13 @@ async def _fetch_sitemap(
     if resp.status_code != 200:
         return []

+    # SPAs with catch-all nginx/Caddy rules return 200 + text/html
+    # for /sitemap.xml. Don't try to parse HTML as XML.
+    content_type = resp.headers.get("content-type", "")
+    if "html" in content_type and "xml" not in content_type:
+        logger.debug("Sitemap %s returned HTML, skipping", url)
+        return []
+
     root = ElementTree.fromstring(resp.text)

     # Check if it's a sitemap index

@@ -42,11 +42,35 @@ def _make_mock_page(
     return page


-def _make_mock_context(page, cookies: list[dict] | None = None):
-    """Build a mock BrowserContext."""
+def _make_mock_context(
+    page,
+    cookies: list[dict] | None = None,
+    delayed_cookies: list[dict] | None = None,
+):
+    """Build a mock BrowserContext.
+
+    *cookies* is returned on the first ``context.cookies()`` call (the
+    initial CDP enumeration). *delayed_cookies* is returned on the
+    second call (the delayed pass); defaults to the same list so
+    existing tests need no changes.
+    """
     context = AsyncMock()
     context.new_page = AsyncMock(return_value=page)
-    context.cookies = AsyncMock(return_value=cookies or [])
+    first = cookies or []
+    second = delayed_cookies if delayed_cookies is not None else first
+    # The crawler calls context.cookies() twice per page (initial +
+    # delayed pass). Using a cycling function instead of a fixed-length
+    # side_effect list so multi-page tests don't exhaust the mock.
+    _cycle = [first, second]
+    _call_count = 0
+
+    async def _cycling_cookies(*_args, **_kwargs):
+        nonlocal _call_count
+        result = _cycle[_call_count % len(_cycle)]
+        _call_count += 1
+        return result
+
+    context.cookies = AsyncMock(side_effect=_cycling_cookies)
     context.clear_cookies = AsyncMock()
     context.close = AsyncMock()
     return context

@@ -373,6 +397,44 @@ class TestCrawlPage:
         call_kwargs = browser.new_context.call_args[1]
         assert call_kwargs["user_agent"] == "CMPBot/1.0"

+    @pytest.mark.asyncio(loop_scope="session")
+    async def test_two_pass_cookie_collection_merges_delayed(self):
+        """Cookies appearing only in the second CDP pass are still discovered."""
+        first_pass = [
+            {"name": "_ga", "domain": ".example.com", "value": "GA1.2.12345"},
+        ]
+        second_pass = [
+            {"name": "_ga", "domain": ".example.com", "value": "GA1.2.12345"},
+            {"name": "_gid", "domain": ".example.com", "value": "GID.99"},
+        ]
+
+        page = _make_mock_page()
+        context = _make_mock_context(page, cookies=first_pass, delayed_cookies=second_pass)
+        browser = _make_mock_browser(context)
+
+        crawler = CookieCrawler()
+        result = await crawler._crawl_page(browser, "https://example.com/")
+
+        cookie_names = [c.name for c in result.cookies if c.storage_type == "cookie"]
+        assert "_ga" in cookie_names
+        assert "_gid" in cookie_names
+        # _ga must not be duplicated
+        assert cookie_names.count("_ga") == 1
+
+    @pytest.mark.asyncio(loop_scope="session")
+    async def test_uses_networkidle_wait(self):
+        """page.goto must use wait_until='networkidle'."""
+        page = _make_mock_page()
+        context = _make_mock_context(page)
+        browser = _make_mock_browser(context)
+
+        crawler = CookieCrawler()
+        await crawler._crawl_page(browser, "https://example.com/")
+
+        page.goto.assert_awaited_once()
+        call_kwargs = page.goto.call_args[1]
+        assert call_kwargs.get("wait_until") == "networkidle"
+

 # ── CookieCrawler.crawl_site ───────────────────────────────────────────

@@ -457,7 +519,9 @@ class TestBuildConsentCookie:
         """``url`` lets Playwright derive domain / path / secure."""
        cookie = _build_consent_cookie("https://example.com/page")
        assert cookie["url"] == "https://example.com/page"
-        assert cookie["path"] == "/"
+        # ``path`` is NOT set explicitly — Playwright derives it from ``url``.
+        # Setting both would cause ``add_cookies`` to reject the cookie.
+        assert "path" not in cookie

     def test_cookie_value_decodes_to_consent_state_with_all_categories(self):
         import json as _json