diff --git a/apps/admin-ui/src/components/SiteScannerTab.tsx b/apps/admin-ui/src/components/SiteScannerTab.tsx
index e486f9f..ae83fdd 100644
--- a/apps/admin-ui/src/components/SiteScannerTab.tsx
+++ b/apps/admin-ui/src/components/SiteScannerTab.tsx
@@ -2,17 +2,34 @@ import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
 import { Fragment, useState } from 'react';
 
 import { getScan, getScanDiff, listScans, triggerScan } from '../api/scanner';
+import { getSiteConfig, updateSiteConfig } from '../api/sites';
 import { trackFeatureUsage } from '../services/analytics';
-import type { CookieDiffItem, ScanDiff, ScanJob, ScanJobDetail, ScanResult } from '../types/api';
+import type { CookieDiffItem, ScanDiff, ScanJob, ScanJobDetail, ScanResult, SiteConfig } from '../types/api';
 import { Alert } from './ui/alert';
 import { Badge } from './ui/badge';
 import { Button } from './ui/button';
+import { Card } from './ui/card';
 import { LoadingState } from './ui/loading-state';
+import { Select } from './ui/select';
 
 interface Props {
   siteId: string;
 }
 
+const SCHEDULE_OPTIONS: { value: string; label: string; cron: string | null }[] = [
+  { value: 'disabled', label: 'Disabled', cron: null },
+  { value: 'daily', label: 'Daily', cron: '0 3 * * *' },
+  { value: 'weekly', label: 'Weekly', cron: '0 3 * * 0' },
+  { value: 'fortnightly', label: 'Fortnightly', cron: '0 3 1,15 * *' },
+  { value: 'monthly', label: 'Monthly', cron: '0 3 1 * *' },
+];
+
+function cronToScheduleValue(cron: string | null | undefined): string {
+  if (!cron) return 'disabled';
+  const match = SCHEDULE_OPTIONS.find((o) => o.cron === cron);
+  return match?.value ?? 'custom';
+}
+
 function statusVariant(status: string): 'warning' | 'info' | 'success' | 'error' | 'neutral' {
   const map: Record<string, 'warning' | 'info' | 'success' | 'error' | 'neutral'> = {
     pending: 'warning',
@@ -183,6 +200,45 @@ export default function SiteScannerTab({ siteId }: Props) {
   const queryClient = useQueryClient();
   const [expandedScanId, setExpandedScanId] = useState<string | null>(null);
 
+  const { data: config } = useQuery({
+    queryKey: ['sites', siteId, 'config'],
+    queryFn: () => getSiteConfig(siteId),
+  });
+
+  const currentCron = config?.scan_schedule_cron ?? null;
+  const savedValue = cronToScheduleValue(currentCron);
+  const [selectedSchedule, setSelectedSchedule] = useState<string | null>(null);
+  const [customCron, setCustomCron] = useState('');
+
+  // Use local selection if the user has interacted, otherwise fall
+  // back to what's saved on the server.
+  const activeValue = selectedSchedule ?? savedValue;
+  const showCustomInput = activeValue === 'custom';
+
+  const scheduleMutation = useMutation({
+    mutationFn: (cron: string | null) => updateSiteConfig(siteId, { scan_schedule_cron: cron } as Partial<SiteConfig>),
+    onSuccess: () => {
+      queryClient.invalidateQueries({ queryKey: ['sites', siteId, 'config'] });
+      trackFeatureUsage('scan', 'schedule_change', { site_id: siteId });
+      setSelectedSchedule(null); // reset to server state
+    },
+  });
+
+  const handleScheduleChange = (value: string) => {
+    setSelectedSchedule(value);
+    if (value === 'custom') {
+      setCustomCron(currentCron ?? '');
+      return;
+    }
+    const option = SCHEDULE_OPTIONS.find((o) => o.value === value);
+    scheduleMutation.mutate(option?.cron ?? null);
+  };
+
+  const handleCustomSave = () => {
+    const trimmed = customCron.trim();
+    scheduleMutation.mutate(trimmed || null);
+  };
+
   const { data: scans, isLoading } = useQuery({
     queryKey: ['scans', siteId],
     queryFn: () => listScans(siteId),
@@ -202,6 +258,64 @@ export default function SiteScannerTab({ siteId }: Props) {
   return (
     <div>
+      {/* Scan schedule */}
+      <Card>
+        <h3>
+          Scan Schedule
+        </h3>
+        <p>
+          Scheduled scans run automatically and re-discover cookies so your inventory stays
+          current. Select a preset or enter a custom cron expression.
+        </p>
+        <div>
+          <Select
+            value={activeValue}
+            onChange={(e) => handleScheduleChange(e.target.value)}
+          >
+            {SCHEDULE_OPTIONS.map((o) => (
+              <option key={o.value} value={o.value}>
+                {o.label}
+              </option>
+            ))}
+            <option value="custom">
+              Custom
+            </option>
+          </Select>
+          {showCustomInput && (
+            <>
+              <input
+                type="text"
+                value={customCron}
+                onChange={(e) => setCustomCron(e.target.value)}
+              />
+              <Button onClick={handleCustomSave}>
+                Save
+              </Button>
+              <a
+                href="https://crontab.guru"
+                target="_blank"
+                rel="noreferrer"
+              >
+                Need help? Use crontab.guru →
+              </a>
+            </>
+          )}
+          {scheduleMutation.isPending && (
+            <span>
+              Saving…
+            </span>
+          )}
+        </div>
+        {currentCron && (
+          <p>
+            Current schedule: <code>{currentCron}</code>
+          </p>
+        )}
+      </Card>
+
+      {/* Header with trigger button */}
       <div>
         <h2>Cookie Scans</h2>
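The preset-to-cron mapping above is the whole contract with the API: the UI only ever stores one of those cron strings (or null). A quick way to sanity-check what each preset means in practice, assuming the third-party croniter package, which is not used anywhere in this change and appears here purely for preview:

```python
# Preview when each schedule preset would next fire. `croniter` is an
# assumption for illustration; the API may resolve cron differently.
from datetime import UTC, datetime

from croniter import croniter

PRESETS = {
    "daily": "0 3 * * *",           # 03:00 every day
    "weekly": "0 3 * * 0",          # 03:00 every Sunday
    "fortnightly": "0 3 1,15 * *",  # 03:00 on the 1st and 15th
    "monthly": "0 3 1 * *",         # 03:00 on the 1st
}

now = datetime.now(UTC)
for name, expr in PRESETS.items():
    nxt = croniter(expr, now).get_next(datetime)
    print(f"{name:12} {expr:15} next: {nxt:%Y-%m-%d %H:%M}")
```

Note that "Fortnightly" is really semi-monthly (the 1st and 15th); a true every-14-days cadence cannot be expressed in a single plain cron expression, so the preset is an approximation.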
diff --git a/apps/admin-ui/src/test/SiteCategoriesTab.test.tsx b/apps/admin-ui/src/test/SiteCategoriesTab.test.tsx
index 956c13c..3f098ba 100644
--- a/apps/admin-ui/src/test/SiteCategoriesTab.test.tsx
+++ b/apps/admin-ui/src/test/SiteCategoriesTab.test.tsx
@@ -43,6 +43,7 @@ const BASE_CONFIG: SiteConfig = {
   scan_enabled: true,
   scan_frequency_hours: 168,
   scan_max_pages: 50,
+  scan_schedule_cron: null,
   enabled_categories: null,
   created_at: '2025-01-01T00:00:00Z',
   updated_at: '2025-01-01T00:00:00Z',
diff --git a/apps/admin-ui/src/test/SiteConfigTab.test.tsx b/apps/admin-ui/src/test/SiteConfigTab.test.tsx
index fba00d2..62784e9 100644
--- a/apps/admin-ui/src/test/SiteConfigTab.test.tsx
+++ b/apps/admin-ui/src/test/SiteConfigTab.test.tsx
@@ -41,6 +41,7 @@ const BASE_CONFIG: SiteConfig = {
   scan_enabled: true,
   scan_frequency_hours: 168,
   scan_max_pages: 50,
+  scan_schedule_cron: null,
   enabled_categories: null,
   created_at: '2025-01-01T00:00:00Z',
   updated_at: '2025-01-01T00:00:00Z',
diff --git a/apps/admin-ui/src/types/api.ts b/apps/admin-ui/src/types/api.ts
index d0636b3..ef2026d 100644
--- a/apps/admin-ui/src/types/api.ts
+++ b/apps/admin-ui/src/types/api.ts
@@ -129,6 +129,7 @@ export interface SiteConfig {
   scan_enabled: boolean;
   scan_frequency_hours: number;
   scan_max_pages: number;
+  scan_schedule_cron: string | null;
   /**
    * Cookie categories the banner should display. ``null`` means
    * "inherit from the cascade" (group → org → system default of all
diff --git a/apps/api/data/open-cookie-database.csv b/apps/api/data/open-cookie-database.csv
index b44506a..c91fefa 100644
--- a/apps/api/data/open-cookie-database.csv
+++ b/apps/api/data/open-cookie-database.csv
@@ -2263,3 +2263,4 @@ c7d8e9f0-0012-4567-890a-000000000012,Plausible Analytics,Analytics,plausible_,,"
 c7d8e9f0-0013-4567-890a-000000000013,Fathom Analytics,Analytics,_fathom,,"Privacy-focused simple website analytics with minimal data collection.",Varies,Conva Ventures Inc,https://usefathom.com/privacy,1
 c7d8e9f0-0014-4567-890a-000000000014,Umami,Analytics,umami.,,"Open-source privacy-friendly web analytics alternative.",Varies,Website operator,https://umami.is/docs/about,1
 c7d8e9f0-0015-4567-890a-000000000015,Vercel,Functional,_vercel_,,"Vercel platform cookies for deployment previews and analytics.",Varies,Vercel Inc,https://vercel.com/legal/privacy-policy,1
+c7d8e9f0-0016-4567-890a-000000000016,Google Ads,Marketing,_gcl_ls,,"Google Click Identifier for localStorage-based ad conversion tracking.",90 Days,Google,https://business.safety.google/privacy/,0
diff --git a/apps/api/src/services/classification.py b/apps/api/src/services/classification.py
index ec7ecc3..9faeeec 100644
--- a/apps/api/src/services/classification.py
+++ b/apps/api/src/services/classification.py
@@ -174,6 +174,26 @@ def classify_cookie(
 
     This is a pure function — all data is passed in, no DB calls.
     """
+    # 0. ConsentOS's own cookies are always necessary. The banner's
+    #    blocker already treats ``_consentos_*`` as exempt; the
+    #    classifier must agree so the admin UI shows them in the
+    #    right category without requiring a known-cookies DB entry.
+    if cookie_name.startswith("_consentos_"):
+        necessary = next(
+            (cat for cat in category_map.values() if cat.slug == "necessary"),
+            None,
+        )
+        return ClassificationResult(
+            cookie_name=cookie_name,
+            cookie_domain=cookie_domain,
+            category_id=necessary.id if necessary else None,
+            category_slug="necessary",
+            vendor="ConsentOS",
+            description="ConsentOS consent management cookie.",
+            match_source=MatchSource.KNOWN_EXACT,
+            matched=True,
+        )
+
     # 1. Check allow-list first (site-specific overrides)
     allow_match = _match_allow_list(cookie_name, cookie_domain, allow_list)
     if allow_match:
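Because the `_consentos_` check runs before the allow-list and known-cookie lookups, site-level overrides can never re-categorise the CMP's own cookies. A sketch of a regression test pinning that down; `category_map` and `allow_list` appear in the hunk, while any further `classify_cookie` parameters are omitted here and assumed to default:

```python
# Sketch of a regression test for the _consentos_ short-circuit.
# The category_map fixture is hypothetical; the real fixtures live
# in the API test suite.
from src.services.classification import classify_cookie


def test_consentos_cookie_short_circuits_to_necessary(category_map):
    result = classify_cookie(
        cookie_name="_consentos_consent",
        cookie_domain="shop.example.com",
        category_map=category_map,
        allow_list=[],
    )
    assert result.matched is True
    assert result.category_slug == "necessary"
    assert result.vendor == "ConsentOS"
```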
diff --git a/apps/api/src/services/scanner.py b/apps/api/src/services/scanner.py
index fc56f47..81f8bb0 100644
--- a/apps/api/src/services/scanner.py
+++ b/apps/api/src/services/scanner.py
@@ -12,7 +12,7 @@ from datetime import UTC, datetime
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 
-from src.models.cookie import Cookie
+from src.models.cookie import Cookie, CookieCategory
 from src.models.scan import ScanJob, ScanResult
 from src.models.site import Site
 from src.schemas.scanner import (
@@ -261,7 +261,13 @@ async def sync_scan_results_to_cookies(
     """Upsert scan results into the site's cookie inventory.
 
     Creates new Cookie records for newly discovered items or updates
-    last_seen_at for existing ones. Returns the number of new cookies.
+    ``last_seen_at`` for existing ones. When ``auto_category`` is set
+    on the scan result and the cookie doesn't already have a
+    manually-assigned category, the auto-classified category is
+    propagated to the cookie inventory so it shows up categorised in
+    the admin UI without requiring manual review.
+
+    Returns the number of new cookies.
     """
     results = await db.execute(select(ScanResult).where(ScanResult.scan_job_id == scan_job_id))
     items = list(results.scalars().all())
@@ -269,6 +275,10 @@ async def sync_scan_results_to_cookies(
     now_iso = datetime.now(UTC).isoformat()
     new_count = 0
 
+    # Pre-load the category slug → id mapping so we don't query per cookie.
+    cat_rows = await db.execute(select(CookieCategory))
+    slug_to_id: dict[str, uuid.UUID] = {cat.slug: cat.id for cat in cat_rows.scalars().all()}
+
     for item in items:
         existing = await db.execute(
             select(Cookie).where(
@@ -280,14 +290,21 @@ async def sync_scan_results_to_cookies(
         )
         cookie = existing.scalar_one_or_none()
 
+        # Resolve the auto-category slug to a category_id.
+        auto_cat_id = slug_to_id.get(item.auto_category) if item.auto_category else None
+
         if cookie:
             cookie.last_seen_at = now_iso
+            # Back-fill the category if not manually assigned yet.
+            if auto_cat_id and not cookie.category_id:
+                cookie.category_id = auto_cat_id
         else:
             cookie = Cookie(
                 site_id=site_id,
                 name=item.cookie_name,
                 domain=item.cookie_domain,
                 storage_type=item.storage_type,
+                category_id=auto_cat_id,
                 review_status="pending",
                 first_seen_at=now_iso,
                 last_seen_at=now_iso,
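The subtle part of the sync change is the precedence rule: auto-classification fills a missing category but never overwrites one a reviewer assigned by hand. The same rule in isolation, as a standalone sketch rather than code from the service:

```python
# Standalone sketch of the category-resolution rule used in
# sync_scan_results_to_cookies: manual assignments always win.
import uuid


def resolve_category(
    existing_id: uuid.UUID | None,
    auto_slug: str | None,
    slug_to_id: dict[str, uuid.UUID],
) -> uuid.UUID | None:
    auto_id = slug_to_id.get(auto_slug) if auto_slug else None
    if existing_id is not None:
        return existing_id  # a manually-assigned category wins
    return auto_id


analytics_id = uuid.uuid4()
marketing_id = uuid.uuid4()
assert resolve_category(None, "analytics", {"analytics": analytics_id}) == analytics_id
assert resolve_category(analytics_id, "marketing", {"marketing": marketing_id}) == analytics_id
```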
diff --git a/apps/scanner/src/crawler.py b/apps/scanner/src/crawler.py
index c87f5fb..bffd1c9 100644
--- a/apps/scanner/src/crawler.py
+++ b/apps/scanner/src/crawler.py
@@ -68,11 +68,13 @@ def _build_consent_cookie(url: str) -> dict:
         "bannerVersion": "scanner",
     }
     value = quote(json.dumps(state, separators=(",", ":")), safe="")
+    # Playwright's ``add_cookies`` accepts EITHER ``url`` (from which
+    # it derives domain/path/secure) OR explicit ``domain`` + ``path``
+    # — but not both. Using ``url`` is simplest.
     return {
         "name": _CONSENT_COOKIE_NAME,
         "value": value,
         "url": url,
-        "path": "/",
         "expires": time.time() + 365 * 86400,
         "sameSite": "Lax",
     }
@@ -201,6 +203,9 @@ class CookieCrawler:
         script_cookies: dict[str, str] = {}  # cookie name → script URL
         initiator_map: dict[str, str] = {}  # request URL → initiating URL
         initiator_chains: dict[str, list[str]] = {}  # cookie name → chain
+        # Cookies discovered directly from Set-Cookie response headers.
+        # Keyed by (name, domain) so they can be merged with CDP results.
+        header_cookies: dict[tuple[str, str], DiscoveredCookie] = {}
 
         context: BrowserContext | None = None
         try:
@@ -236,7 +241,9 @@ class CookieCrawler:
         page.on("request", _on_request)
 
-        # Track Set-Cookie headers from responses
+        # Track Set-Cookie headers from responses and create
+        # DiscoveredCookie entries directly — CDP's context.cookies()
+        # may not enumerate cross-domain cookies.
         async def _on_response(response: Response) -> None:
             try:
                 headers = await response.all_headers()
@@ -247,25 +254,67 @@ class CookieCrawler:
                     initiator = _get_script_initiator(request)
                     # Build the initiator chain for this request
                     chain = _build_initiator_chain(request.url, initiator_map)
+                    resp_domain = urlparse(response.url).hostname or ""
                     for cookie_str in set_cookie.split("\n"):
                         name = cookie_str.split("=")[0].strip()
                         if name:
                             if initiator:
                                 script_cookies[name] = initiator
                                 initiator_chains[name] = chain
+                            # Parse optional Domain attribute from
+                            # the Set-Cookie header; fall back to
+                            # the response hostname.
+                            domain = resp_domain
+                            for part in cookie_str.split(";")[1:]:
+                                part = part.strip()
+                                if part.lower().startswith("domain="):
+                                    domain = part.split("=", 1)[1].strip()
+                                    break
+                            key = (name, domain)
+                            if key not in header_cookies:
+                                header_cookies[key] = DiscoveredCookie(
+                                    name=name,
+                                    domain=domain,
+                                    storage_type="cookie",
+                                    script_source=initiator,
+                                    page_url=url,
+                                    initiator_chain=chain,
+                                )
             except Exception:
                 pass  # Non-critical — response may have been aborted
 
         page.on("response", _on_response)
 
-        # Navigate
-        await page.goto(url, wait_until="domcontentloaded", timeout=self._timeout_ms)
-        # Allow additional time for scripts to set cookies after DOM load.
-        await page.wait_for_timeout(3000)
+        # Navigate — networkidle waits until there have been no network
+        # connections for at least 500 ms, which catches the GA beacon
+        # round-trip that domcontentloaded misses.
+        await page.goto(url, wait_until="networkidle", timeout=self._timeout_ms)
+        # Safety margin for late-firing scripts (e.g. deferred GTM tags).
+        await page.wait_for_timeout(5000)
 
-        # Enumerate browser cookies via CDP
+        # First pass — enumerate browser cookies via CDP.
         cdp_cookies = await context.cookies()
+
+        # Second pass — wait a further 2 seconds for any delayed
+        # Set-Cookie headers, then merge newly appeared cookies.
+        await page.wait_for_timeout(2000)
+        delayed_cookies = await context.cookies()
+
+        # Merge: index first-pass cookies by (name, domain), then
+        # add any that only appeared in the second pass.
+        seen_keys: set[tuple[str, str]] = set()
+        all_cdp_cookies: list[dict] = []
         for c in cdp_cookies:
+            key = (c["name"], c["domain"])
+            seen_keys.add(key)
+            all_cdp_cookies.append(c)
+        for c in delayed_cookies:
+            key = (c["name"], c["domain"])
+            if key not in seen_keys:
+                seen_keys.add(key)
+                all_cdp_cookies.append(c)
+
+        for c in all_cdp_cookies:
             result.cookies.append(
                 DiscoveredCookie(
                     name=c["name"],
@@ -283,6 +332,13 @@ class CookieCrawler:
                 )
             )
 
+        # Merge cookies seen in Set-Cookie headers but NOT in the
+        # CDP cookie jar (e.g. cross-domain cookies that the browser
+        # scoped to a different origin).
+        for key, hc in header_cookies.items():
+            if key not in seen_keys:
+                result.cookies.append(hc)
+
         # Enumerate localStorage
         ls_items = await page.evaluate("""() => {
             const items = [];
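The crawler parses the Domain attribute by hand, which is fine for well-formed headers. For reference, the stdlib can do the same extraction; this is not what the crawler uses, and `http.cookies.SimpleCookie` is stricter, silently dropping morsels it cannot parse:

```python
# Reference sketch: Domain extraction via the stdlib instead of manual
# splitting. Assumes one cookie per line, matching Playwright's
# all_headers(), which joins multiple Set-Cookie values with "\n".
from http.cookies import SimpleCookie


def domain_from_set_cookie(line: str, fallback: str) -> str:
    jar = SimpleCookie()
    jar.load(line)
    for morsel in jar.values():
        return morsel["domain"] or fallback
    return fallback  # unparseable line, keep the response hostname


print(domain_from_set_cookie("_ga=GA1.2.1; Domain=.example.com; Path=/", "example.com"))
# -> .example.com
print(domain_from_set_cookie("session=abc; Path=/; HttpOnly", "example.com"))
# -> example.com
```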
diff --git a/apps/scanner/src/sitemap.py b/apps/scanner/src/sitemap.py
index 32e6c91..f65f4a3 100644
--- a/apps/scanner/src/sitemap.py
+++ b/apps/scanner/src/sitemap.py
@@ -75,6 +75,13 @@ async def _fetch_sitemap(
     if resp.status_code != 200:
         return []
 
+    # SPAs with catch-all nginx/Caddy rules return 200 + text/html
+    # for /sitemap.xml. Don't try to parse HTML as XML.
+    content_type = resp.headers.get("content-type", "")
+    if "html" in content_type and "xml" not in content_type:
+        logger.debug("Sitemap %s returned HTML, skipping", url)
+        return []
+
     root = ElementTree.fromstring(resp.text)
 
     # Check if it's a sitemap index
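Without the content-type guard, an SPA's HTML shell would reach `ElementTree.fromstring`, which raises rather than returning an empty URL list. A minimal reproduction of the failure mode the check prevents:

```python
# An HTML shell is not well-formed XML: expat rejects the lowercase
# doctype (and the unclosed <meta> would fail later anyway).
from xml.etree import ElementTree

html_shell = '<!doctype html><html><head><meta charset="utf-8"></head><body>app</body></html>'
try:
    ElementTree.fromstring(html_shell)
except ElementTree.ParseError as exc:
    print(f"ParseError as expected: {exc}")
```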
diff --git a/apps/scanner/tests/test_crawler.py b/apps/scanner/tests/test_crawler.py
index 797c08b..3c987b3 100644
--- a/apps/scanner/tests/test_crawler.py
+++ b/apps/scanner/tests/test_crawler.py
@@ -42,11 +42,35 @@ def _make_mock_page(
     return page
 
 
-def _make_mock_context(page, cookies: list[dict] | None = None):
-    """Build a mock BrowserContext."""
+def _make_mock_context(
+    page,
+    cookies: list[dict] | None = None,
+    delayed_cookies: list[dict] | None = None,
+):
+    """Build a mock BrowserContext.
+
+    *cookies* is returned on the first ``context.cookies()`` call (the
+    initial CDP enumeration). *delayed_cookies* is returned on the
+    second call (the delayed pass); defaults to the same list so
+    existing tests need no changes.
+    """
     context = AsyncMock()
     context.new_page = AsyncMock(return_value=page)
-    context.cookies = AsyncMock(return_value=cookies or [])
+    first = cookies or []
+    second = delayed_cookies if delayed_cookies is not None else first
+    # The crawler calls context.cookies() twice per page (initial +
+    # delayed pass). Use a cycling function instead of a fixed-length
+    # side_effect list so that multi-page tests don't exhaust the mock.
+    _cycle = [first, second]
+    _call_count = 0
+
+    async def _cycling_cookies(*_args, **_kwargs):
+        nonlocal _call_count
+        result = _cycle[_call_count % len(_cycle)]
+        _call_count += 1
+        return result
+
+    context.cookies = AsyncMock(side_effect=_cycling_cookies)
    context.clear_cookies = AsyncMock()
    context.close = AsyncMock()
    return context
@@ -373,6 +397,44 @@ class TestCrawlPage:
         call_kwargs = browser.new_context.call_args[1]
         assert call_kwargs["user_agent"] == "CMPBot/1.0"
 
+    @pytest.mark.asyncio(loop_scope="session")
+    async def test_two_pass_cookie_collection_merges_delayed(self):
+        """Cookies appearing only in the second CDP pass are still discovered."""
+        first_pass = [
+            {"name": "_ga", "domain": ".example.com", "value": "GA1.2.12345"},
+        ]
+        second_pass = [
+            {"name": "_ga", "domain": ".example.com", "value": "GA1.2.12345"},
+            {"name": "_gid", "domain": ".example.com", "value": "GID.99"},
+        ]
+
+        page = _make_mock_page()
+        context = _make_mock_context(page, cookies=first_pass, delayed_cookies=second_pass)
+        browser = _make_mock_browser(context)
+
+        crawler = CookieCrawler()
+        result = await crawler._crawl_page(browser, "https://example.com/")
+
+        cookie_names = [c.name for c in result.cookies if c.storage_type == "cookie"]
+        assert "_ga" in cookie_names
+        assert "_gid" in cookie_names
+        # _ga must not be duplicated
+        assert cookie_names.count("_ga") == 1
+
+    @pytest.mark.asyncio(loop_scope="session")
+    async def test_uses_networkidle_wait(self):
+        """page.goto must use wait_until='networkidle'."""
+        page = _make_mock_page()
+        context = _make_mock_context(page)
+        browser = _make_mock_browser(context)
+
+        crawler = CookieCrawler()
+        await crawler._crawl_page(browser, "https://example.com/")
+
+        page.goto.assert_awaited_once()
+        call_kwargs = page.goto.call_args[1]
+        assert call_kwargs.get("wait_until") == "networkidle"
+
 
 # ── CookieCrawler.crawl_site ───────────────────────────────────────────
 
@@ -457,7 +519,9 @@ class TestBuildConsentCookie:
         """``url`` lets Playwright derive domain / path / secure."""
         cookie = _build_consent_cookie("https://example.com/page")
         assert cookie["url"] == "https://example.com/page"
-        assert cookie["path"] == "/"
+        # ``path`` is NOT set explicitly — Playwright derives it from ``url``.
+        # Setting both would cause ``add_cookies`` to reject the cookie.
+        assert "path" not in cookie
 
     def test_cookie_value_decodes_to_consent_state_with_all_categories(self):
         import json as _json
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index b646144..7256d2f 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -27,7 +27,7 @@ services:
       command:
         - "sh"
         - "-c"
-        - "python -m alembic upgrade head && python -m src.cli.bootstrap_admin"
+        - "python -m alembic upgrade head && python -m src.cli.bootstrap_admin && python -m src.cli.seed_known_cookies"
       restart: "no"
       depends_on:
         postgres:
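Chaining the seed step with `&&` means it only runs after migrations and admin bootstrap succeed, and a non-zero exit from any step fails the one-shot migrate container. The seeding CLI itself is not shown in this diff; a hypothetical sketch of the CSV it consumes, based only on the rows visible above (re-running on every deploy implies the real command upserts by the leading ID column):

```python
# Hypothetical sketch; the real CLI lives in src/cli/seed_known_cookies
# and is not part of this diff. Column layout inferred from the rows
# added to apps/api/data/open-cookie-database.csv above.
import csv


def load_known_cookie_rows(path: str) -> list[list[str]]:
    with open(path, newline="", encoding="utf-8") as fh:
        reader = csv.reader(fh)
        next(reader, None)  # skip the header row
        return [row for row in reader if row]


rows = load_known_cookie_rows("apps/api/data/open-cookie-database.csv")
print(f"{len(rows)} known-cookie definitions; last added: {rows[-1][1]}")  # Google Ads
```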