fix(scanner): reliable cookie discovery, auto-categorisation, and scan scheduling UI (#7)

Scanner fixes:
- Remove the conflicting ``path`` from the consent pre-seed cookie
  (Playwright rejects cookies that set both ``url`` and ``path``).
- Wait for ``networkidle`` plus a 5s settle delay, then run a second
  capture pass after a further 2s so late-set cookies are reliably
  picked up (sketched after this list).
- Check the sitemap response's Content-Type so SPA servers that answer
  every path with an HTML fallback page are skipped (see the second
  sketch after this list).
- Propagate ``auto_category`` from scan results to the cookies table
  during sync (was silently dropped).
- Add ``_gcl_ls`` to the Open Cookie Database CSV.
- Classify ``_consentos_*`` cookies as necessary directly in the
  classification engine.
- Add ``seed_known_cookies`` to the bootstrap init container command.
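
For context, a minimal sketch of the capture flow the first two bullets
describe, using Playwright's async API. The URL, cookie name, and value
are illustrative placeholders, not the scanner's actual code:

    from playwright.async_api import async_playwright

    async def capture_cookies(target_url: str) -> list[dict]:
        async with async_playwright() as pw:
            browser = await pw.chromium.launch()
            context = await browser.new_context()
            # Pre-seed the consent cookie. Playwright rejects a cookie that
            # sets both ``url`` and ``path``, so only ``url`` is supplied.
            await context.add_cookies([{
                "name": "_consentos_consent",  # hypothetical name
                "value": "accepted",
                "url": target_url,  # ``url`` implies domain and path
            }])
            page = await context.new_page()
            # First pass: wait for the network to go quiet, then 5s more.
            await page.goto(target_url, wait_until="networkidle")
            await page.wait_for_timeout(5_000)
            seen = {(c["name"], c["domain"]): c for c in await context.cookies()}
            # Second pass 2s later catches late-set tracker cookies.
            await page.wait_for_timeout(2_000)
            for c in await context.cookies():
                seen[(c["name"], c["domain"])] = c
            await browser.close()
            return list(seen.values())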
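
And a sketch of the sitemap Content-Type guard from the third bullet,
assuming an httpx-based fetch; the helper name and the accepted MIME
types are assumptions, not the shipped code:

    import httpx

    XML_TYPES = {"application/xml", "text/xml"}

    async def fetch_sitemap(url: str) -> str | None:
        async with httpx.AsyncClient(follow_redirects=True) as client:
            resp = await client.get(url)
        if resp.status_code != 200:
            return None
        # SPAs often answer every path with 200 + an HTML index page; a
        # real sitemap must be XML, so anything else counts as "no sitemap".
        content_type = resp.headers.get("content-type", "").split(";")[0].strip().lower()
        if content_type not in XML_TYPES:
            return None
        return resp.text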

Admin UI:
- Add a scan schedule control to the Scans tab — preset options
  (disabled/daily/weekly/fortnightly/monthly) plus a custom cron input.
  Saves ``scan_schedule_cron`` on the site config (see the sketch below).
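
A sketch of how the presets could map onto cron expressions before being
saved to ``scan_schedule_cron``; the exact expressions and the helper are
assumptions, not the shipped UI code:

    # Illustrative preset → cron mapping; the real UI may use other times.
    SCHEDULE_PRESETS: dict[str, str | None] = {
        "disabled": None,               # stored as NULL: no scheduled scans
        "daily": "0 3 * * *",           # every day at 03:00
        "weekly": "0 3 * * 1",          # Mondays at 03:00
        "fortnightly": "0 3 1,15 * *",  # cron can't do "every 14 days"; approximated
        "monthly": "0 3 1 * *",         # first of the month at 03:00
    }

    def resolve_schedule_cron(preset: str, custom_cron: str = "") -> str | None:
        """Value to persist as ``scan_schedule_cron`` (custom input wins)."""
        return custom_cron.strip() or SCHEDULE_PRESETS.get(preset)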

Author:    James Cottrill
Date:      2026-04-18 20:14:32 +01:00
Committer: GitHub
Parent:    80dfc15319
Commit:    e0f1dd43e8

11 changed files with 297 additions and 15 deletions

View File

@@ -2263,3 +2263,4 @@ c7d8e9f0-0012-4567-890a-000000000012,Plausible Analytics,Analytics,plausible_,,"
 c7d8e9f0-0013-4567-890a-000000000013,Fathom Analytics,Analytics,_fathom,,"Privacy-focused simple website analytics with minimal data collection.",Varies,Conva Ventures Inc,https://usefathom.com/privacy,1
 c7d8e9f0-0014-4567-890a-000000000014,Umami,Analytics,umami.,,"Open-source privacy-friendly web analytics alternative.",Varies,Website operator,https://umami.is/docs/about,1
 c7d8e9f0-0015-4567-890a-000000000015,Vercel,Functional,_vercel_,,"Vercel platform cookies for deployment previews and analytics.",Varies,Vercel Inc,https://vercel.com/legal/privacy-policy,1
+c7d8e9f0-0016-4567-890a-000000000016,Google Ads,Marketing,_gcl_ls,,"Google Click Identifier for localStorage-based ad conversion tracking.",90 Days,Google,https://business.safety.google/privacy/,0

View File

@@ -174,6 +174,26 @@ def classify_cookie(
 
     This is a pure function — all data is passed in, no DB calls.
     """
+    # 0. ConsentOS's own cookies are always necessary. The banner's
+    #    blocker already treats ``_consentos_*`` as exempt; the
+    #    classifier must agree so the admin UI shows them in the
+    #    right category without requiring a known-cookies DB entry.
+    if cookie_name.startswith("_consentos_"):
+        necessary = next(
+            (cat for cat in category_map.values() if cat.slug == "necessary"),
+            None,
+        )
+        return ClassificationResult(
+            cookie_name=cookie_name,
+            cookie_domain=cookie_domain,
+            category_id=necessary.id if necessary else None,
+            category_slug="necessary",
+            vendor="ConsentOS",
+            description="ConsentOS consent management cookie.",
+            match_source=MatchSource.KNOWN_EXACT,
+            matched=True,
+        )
+
     # 1. Check allow-list first (site-specific overrides)
     allow_match = _match_allow_list(cookie_name, cookie_domain, allow_list)
     if allow_match:
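
For illustration, the short-circuit above means a call like the following
(hypothetical; ``classify_cookie`` may take further parameters not visible
in this hunk) returns a necessary-category match without ever consulting
the known-cookies data:

    result = classify_cookie(
        cookie_name="_consentos_prefs",  # any ``_consentos_*`` name
        cookie_domain="example.com",
        category_map=category_map,       # slug → category objects, as above
        allow_list=[],
    )
    assert result.category_slug == "necessary" and result.matched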

View File

@@ -12,7 +12,7 @@ from datetime import UTC, datetime
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from src.models.cookie import Cookie
+from src.models.cookie import Cookie, CookieCategory
 from src.models.scan import ScanJob, ScanResult
 from src.models.site import Site
 from src.schemas.scanner import (
@@ -261,7 +261,13 @@ async def sync_scan_results_to_cookies(
     """Upsert scan results into the site's cookie inventory.
 
     Creates new Cookie records for newly discovered items or updates
-    last_seen_at for existing ones. Returns the number of new cookies.
+    ``last_seen_at`` for existing ones. When ``auto_category`` is set
+    on the scan result and the cookie doesn't already have a
+    manually-assigned category, the auto-classified category is
+    propagated to the cookie inventory so it shows up categorised in
+    the admin UI without requiring manual review.
+
+    Returns the number of new cookies.
     """
     results = await db.execute(select(ScanResult).where(ScanResult.scan_job_id == scan_job_id))
     items = list(results.scalars().all())
@@ -269,6 +275,10 @@ async def sync_scan_results_to_cookies(
     now_iso = datetime.now(UTC).isoformat()
     new_count = 0
 
+    # Pre-load the category slug → id mapping so we don't query per cookie.
+    cat_rows = await db.execute(select(CookieCategory))
+    slug_to_id: dict[str, uuid.UUID] = {cat.slug: cat.id for cat in cat_rows.scalars().all()}
+
     for item in items:
         existing = await db.execute(
             select(Cookie).where(
@@ -280,14 +290,21 @@ async def sync_scan_results_to_cookies(
             )
         )
         cookie = existing.scalar_one_or_none()
+        # Resolve the auto-category slug to a category_id.
+        auto_cat_id = slug_to_id.get(item.auto_category) if item.auto_category else None
+
         if cookie:
             cookie.last_seen_at = now_iso
+            # Back-fill the category if not manually assigned yet.
+            if auto_cat_id and not cookie.category_id:
+                cookie.category_id = auto_cat_id
         else:
             cookie = Cookie(
                 site_id=site_id,
                 name=item.cookie_name,
                 domain=item.cookie_domain,
                 storage_type=item.storage_type,
+                category_id=auto_cat_id,
                 review_status="pending",
                 first_seen_at=now_iso,
                 last_seen_at=now_iso,