fix(scanner): reliable cookie discovery, auto-categorisation, and scan scheduling UI (#7)
Scanner fixes:
- Remove conflicting ``path`` from consent pre-seed cookie (Playwright rejects cookies with both ``url`` and ``path``).
- Switch to ``networkidle`` + 5s + 2s delayed second-pass for reliable cookie capture.
- Check sitemap Content-Type to skip SPA HTML fallbacks.
- Propagate ``auto_category`` from scan results to the cookies table during sync (was silently dropped).
- Add ``_gcl_ls`` to the Open Cookie Database CSV.
- Classify ``_consentos_*`` cookies as necessary directly in the classification engine.
- Add ``seed_known_cookies`` to the bootstrap init container command.

Admin UI:
- Add scan schedule control to the Scans tab — preset options (disabled/daily/weekly/fortnightly/monthly) plus custom cron input. Saves ``scan_schedule_cron`` on the site config.
This commit is contained in:
@@ -2263,3 +2263,4 @@ c7d8e9f0-0012-4567-890a-000000000012,Plausible Analytics,Analytics,plausible_,,"
|
||||
c7d8e9f0-0013-4567-890a-000000000013,Fathom Analytics,Analytics,_fathom,,"Privacy-focused simple website analytics with minimal data collection.",Varies,Conva Ventures Inc,https://usefathom.com/privacy,1
|
||||
c7d8e9f0-0014-4567-890a-000000000014,Umami,Analytics,umami.,,"Open-source privacy-friendly web analytics alternative.",Varies,Website operator,https://umami.is/docs/about,1
|
||||
c7d8e9f0-0015-4567-890a-000000000015,Vercel,Functional,_vercel_,,"Vercel platform cookies for deployment previews and analytics.",Varies,Vercel Inc,https://vercel.com/legal/privacy-policy,1
|
||||
c7d8e9f0-0016-4567-890a-000000000016,Google Ads,Marketing,_gcl_ls,,"Google Click Identifier for localStorage-based ad conversion tracking.",90 Days,Google,https://business.safety.google/privacy/,0
|
||||
|
||||
|
Can't render this file because it is too large.
|
@@ -174,6 +174,26 @@ def classify_cookie(
|
||||
|
||||
This is a pure function — all data is passed in, no DB calls.
|
||||
"""
|
||||
# 0. ConsentOS's own cookies are always necessary. The banner's
|
||||
# blocker already treats ``_consentos_*`` as exempt; the
|
||||
# classifier must agree so the admin UI shows them in the
|
||||
# right category without requiring a known-cookies DB entry.
|
||||
if cookie_name.startswith("_consentos_"):
|
||||
necessary = next(
|
||||
(cat for cat in category_map.values() if cat.slug == "necessary"),
|
||||
None,
|
||||
)
|
||||
return ClassificationResult(
|
||||
cookie_name=cookie_name,
|
||||
cookie_domain=cookie_domain,
|
||||
category_id=necessary.id if necessary else None,
|
||||
category_slug="necessary",
|
||||
vendor="ConsentOS",
|
||||
description="ConsentOS consent management cookie.",
|
||||
match_source=MatchSource.KNOWN_EXACT,
|
||||
matched=True,
|
||||
)
|
||||
|
||||
# 1. Check allow-list first (site-specific overrides)
|
||||
allow_match = _match_allow_list(cookie_name, cookie_domain, allow_list)
|
||||
if allow_match:
|
||||
|
||||
@@ -12,7 +12,7 @@ from datetime import UTC, datetime
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from src.models.cookie import Cookie
|
||||
from src.models.cookie import Cookie, CookieCategory
|
||||
from src.models.scan import ScanJob, ScanResult
|
||||
from src.models.site import Site
|
||||
from src.schemas.scanner import (
|
||||
@@ -261,7 +261,13 @@ async def sync_scan_results_to_cookies(
|
||||
"""Upsert scan results into the site's cookie inventory.
|
||||
|
||||
Creates new Cookie records for newly discovered items or updates
|
||||
last_seen_at for existing ones. Returns the number of new cookies.
|
||||
``last_seen_at`` for existing ones. When ``auto_category`` is set
|
||||
on the scan result and the cookie doesn't already have a
|
||||
manually-assigned category, the auto-classified category is
|
||||
propagated to the cookie inventory so it shows up categorised in
|
||||
the admin UI without requiring manual review.
|
||||
|
||||
Returns the number of new cookies.
|
||||
"""
|
||||
results = await db.execute(select(ScanResult).where(ScanResult.scan_job_id == scan_job_id))
|
||||
items = list(results.scalars().all())
|
||||
@@ -269,6 +275,10 @@ async def sync_scan_results_to_cookies(
|
||||
now_iso = datetime.now(UTC).isoformat()
|
||||
new_count = 0
|
||||
|
||||
# Pre-load the category slug → id mapping so we don't query per cookie.
|
||||
cat_rows = await db.execute(select(CookieCategory))
|
||||
slug_to_id: dict[str, uuid.UUID] = {cat.slug: cat.id for cat in cat_rows.scalars().all()}
|
||||
|
||||
for item in items:
|
||||
existing = await db.execute(
|
||||
select(Cookie).where(
|
||||
@@ -280,14 +290,21 @@ async def sync_scan_results_to_cookies(
|
||||
)
|
||||
cookie = existing.scalar_one_or_none()
|
||||
|
||||
# Resolve the auto-category slug to a category_id.
|
||||
auto_cat_id = slug_to_id.get(item.auto_category) if item.auto_category else None
|
||||
|
||||
if cookie:
|
||||
cookie.last_seen_at = now_iso
|
||||
# Back-fill the category if not manually assigned yet.
|
||||
if auto_cat_id and not cookie.category_id:
|
||||
cookie.category_id = auto_cat_id
|
||||
else:
|
||||
cookie = Cookie(
|
||||
site_id=site_id,
|
||||
name=item.cookie_name,
|
||||
domain=item.cookie_domain,
|
||||
storage_type=item.storage_type,
|
||||
category_id=auto_cat_id,
|
||||
review_status="pending",
|
||||
first_seen_at=now_iso,
|
||||
last_seen_at=now_iso,
|
||||
|
||||
Reference in New Issue
Block a user