fix(scanner): reliable cookie discovery, auto-categorisation, and scan scheduling UI (#7)

Scanner fixes:
- Remove the conflicting ``path`` from the consent pre-seed cookie
  (Playwright rejects cookies that set both ``url`` and ``path``).
- Wait for ``networkidle`` plus a 5s settle delay, then run a second
  capture pass after a further 2s so late-set cookies are reliably
  picked up (sketched after this list).
- Check the sitemap response's Content-Type so SPA servers that answer
  every path with an HTML fallback page are skipped (see the second
  sketch after this list).
- Propagate ``auto_category`` from scan results to the cookies table
  during sync (was silently dropped).
- Add ``_gcl_ls`` to the Open Cookie Database CSV.
- Classify ``_consentos_*`` cookies as necessary directly in the
  classification engine.
- Add ``seed_known_cookies`` to the bootstrap init container command.
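
For context, a minimal sketch of the capture flow the first two bullets
describe, using Playwright's async API. The URL, cookie name, and value
are illustrative placeholders, not the scanner's actual code:

    from playwright.async_api import async_playwright

    async def capture_cookies(target_url: str) -> list[dict]:
        async with async_playwright() as pw:
            browser = await pw.chromium.launch()
            context = await browser.new_context()
            # Pre-seed the consent cookie. Playwright rejects a cookie that
            # sets both ``url`` and ``path``, so only ``url`` is supplied.
            await context.add_cookies([{
                "name": "_consentos_consent",  # hypothetical name
                "value": "accepted",
                "url": target_url,  # ``url`` implies domain and path
            }])
            page = await context.new_page()
            # First pass: wait for the network to go quiet, then 5s more.
            await page.goto(target_url, wait_until="networkidle")
            await page.wait_for_timeout(5_000)
            seen = {(c["name"], c["domain"]): c for c in await context.cookies()}
            # Second pass 2s later catches late-set tracker cookies.
            await page.wait_for_timeout(2_000)
            for c in await context.cookies():
                seen[(c["name"], c["domain"])] = c
            await browser.close()
            return list(seen.values())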
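
And a sketch of the sitemap Content-Type guard from the third bullet,
assuming an httpx-based fetch; the helper name and the accepted MIME
types are assumptions, not the shipped code:

    import httpx

    XML_TYPES = {"application/xml", "text/xml"}

    async def fetch_sitemap(url: str) -> str | None:
        async with httpx.AsyncClient(follow_redirects=True) as client:
            resp = await client.get(url)
        if resp.status_code != 200:
            return None
        # SPAs often answer every path with 200 + an HTML index page; a
        # real sitemap must be XML, so anything else counts as "no sitemap".
        content_type = resp.headers.get("content-type", "").split(";")[0].strip().lower()
        if content_type not in XML_TYPES:
            return None
        return resp.text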

Admin UI:
- Add a scan schedule control to the Scans tab — preset options
  (disabled/daily/weekly/fortnightly/monthly) plus a custom cron input.
  Saves ``scan_schedule_cron`` on the site config (see the sketch below).
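
A sketch of how the presets could map onto cron expressions before being
saved to ``scan_schedule_cron``; the exact expressions and the helper are
assumptions, not the shipped UI code:

    # Illustrative preset → cron mapping; the real UI may use other times.
    SCHEDULE_PRESETS: dict[str, str | None] = {
        "disabled": None,               # stored as NULL: no scheduled scans
        "daily": "0 3 * * *",           # every day at 03:00
        "weekly": "0 3 * * 1",          # Mondays at 03:00
        "fortnightly": "0 3 1,15 * *",  # cron can't do "every 14 days"; approximated
        "monthly": "0 3 1 * *",         # first of the month at 03:00
    }

    def resolve_schedule_cron(preset: str, custom_cron: str = "") -> str | None:
        """Value to persist as ``scan_schedule_cron`` (custom input wins)."""
        return custom_cron.strip() or SCHEDULE_PRESETS.get(preset)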

Author:    James Cottrill
Date:      2026-04-18 20:14:32 +01:00
Committer: GitHub
Parent:    80dfc15319
Commit:    e0f1dd43e8

11 changed files with 297 additions and 15 deletions

View File

@@ -2263,3 +2263,4 @@ c7d8e9f0-0012-4567-890a-000000000012,Plausible Analytics,Analytics,plausible_,,"
 c7d8e9f0-0013-4567-890a-000000000013,Fathom Analytics,Analytics,_fathom,,"Privacy-focused simple website analytics with minimal data collection.",Varies,Conva Ventures Inc,https://usefathom.com/privacy,1
 c7d8e9f0-0014-4567-890a-000000000014,Umami,Analytics,umami.,,"Open-source privacy-friendly web analytics alternative.",Varies,Website operator,https://umami.is/docs/about,1
 c7d8e9f0-0015-4567-890a-000000000015,Vercel,Functional,_vercel_,,"Vercel platform cookies for deployment previews and analytics.",Varies,Vercel Inc,https://vercel.com/legal/privacy-policy,1
+c7d8e9f0-0016-4567-890a-000000000016,Google Ads,Marketing,_gcl_ls,,"Google Click Identifier for localStorage-based ad conversion tracking.",90 Days,Google,https://business.safety.google/privacy/,0

View File

@@ -174,6 +174,26 @@ def classify_cookie(
 
     This is a pure function — all data is passed in, no DB calls.
     """
+    # 0. ConsentOS's own cookies are always necessary. The banner's
+    #    blocker already treats ``_consentos_*`` as exempt; the
+    #    classifier must agree so the admin UI shows them in the
+    #    right category without requiring a known-cookies DB entry.
+    if cookie_name.startswith("_consentos_"):
+        necessary = next(
+            (cat for cat in category_map.values() if cat.slug == "necessary"),
+            None,
+        )
+        return ClassificationResult(
+            cookie_name=cookie_name,
+            cookie_domain=cookie_domain,
+            category_id=necessary.id if necessary else None,
+            category_slug="necessary",
+            vendor="ConsentOS",
+            description="ConsentOS consent management cookie.",
+            match_source=MatchSource.KNOWN_EXACT,
+            matched=True,
+        )
+
     # 1. Check allow-list first (site-specific overrides)
     allow_match = _match_allow_list(cookie_name, cookie_domain, allow_list)
     if allow_match:
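
For illustration, the short-circuit above means a call like the following
(hypothetical; ``classify_cookie`` may take further parameters not visible
in this hunk) returns a necessary-category match without ever consulting
the known-cookies data:

    result = classify_cookie(
        cookie_name="_consentos_prefs",  # any ``_consentos_*`` name
        cookie_domain="example.com",
        category_map=category_map,       # slug → category objects, as above
        allow_list=[],
    )
    assert result.category_slug == "necessary" and result.matched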

View File

@@ -12,7 +12,7 @@ from datetime import UTC, datetime
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from src.models.cookie import Cookie
+from src.models.cookie import Cookie, CookieCategory
 from src.models.scan import ScanJob, ScanResult
 from src.models.site import Site
 from src.schemas.scanner import (
@@ -261,7 +261,13 @@ async def sync_scan_results_to_cookies(
     """Upsert scan results into the site's cookie inventory.
 
     Creates new Cookie records for newly discovered items or updates
-    last_seen_at for existing ones. Returns the number of new cookies.
+    ``last_seen_at`` for existing ones. When ``auto_category`` is set
+    on the scan result and the cookie doesn't already have a
+    manually-assigned category, the auto-classified category is
+    propagated to the cookie inventory so it shows up categorised in
+    the admin UI without requiring manual review.
+
+    Returns the number of new cookies.
     """
     results = await db.execute(select(ScanResult).where(ScanResult.scan_job_id == scan_job_id))
     items = list(results.scalars().all())
@@ -269,6 +275,10 @@ async def sync_scan_results_to_cookies(
     now_iso = datetime.now(UTC).isoformat()
     new_count = 0
 
+    # Pre-load the category slug → id mapping so we don't query per cookie.
+    cat_rows = await db.execute(select(CookieCategory))
+    slug_to_id: dict[str, uuid.UUID] = {cat.slug: cat.id for cat in cat_rows.scalars().all()}
+
     for item in items:
         existing = await db.execute(
             select(Cookie).where(
@@ -280,14 +290,21 @@ async def sync_scan_results_to_cookies(
             )
         )
         cookie = existing.scalar_one_or_none()
+        # Resolve the auto-category slug to a category_id.
+        auto_cat_id = slug_to_id.get(item.auto_category) if item.auto_category else None
+
         if cookie:
             cookie.last_seen_at = now_iso
+            # Back-fill the category if not manually assigned yet.
+            if auto_cat_id and not cookie.category_id:
+                cookie.category_id = auto_cat_id
         else:
             cookie = Cookie(
                 site_id=site_id,
                 name=item.cookie_name,
                 domain=item.cookie_domain,
                 storage_type=item.storage_type,
+                category_id=auto_cat_id,
                 review_status="pending",
                 first_seen_at=now_iso,
                 last_seen_at=now_iso,