fix(scanner): reliable cookie discovery, auto-categorisation, and scan scheduling UI (#7)
Scanner fixes:
- Remove conflicting ``path`` from consent pre-seed cookie (Playwright rejects cookies with both ``url`` and ``path``).
- Switch to ``networkidle`` + 5s + 2s delayed second-pass for reliable cookie capture.
- Check sitemap Content-Type to skip SPA HTML fallbacks.
- Propagate ``auto_category`` from scan results to the cookies table during sync (was silently dropped).
- Add ``_gcl_ls`` to the Open Cookie Database CSV.
- Classify ``_consentos_*`` cookies as necessary directly in the classification engine.
- Add ``seed_known_cookies`` to the bootstrap init container command.

Admin UI:
- Add scan schedule control to the Scans tab — preset options (disabled/daily/weekly/fortnightly/monthly) plus custom cron input. Saves ``scan_schedule_cron`` on the site config.
This commit is contained in:
@@ -2,17 +2,34 @@ import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
|
|||||||
import { Fragment, useState } from 'react';
|
import { Fragment, useState } from 'react';
|
||||||
|
|
||||||
import { getScan, getScanDiff, listScans, triggerScan } from '../api/scanner';
|
import { getScan, getScanDiff, listScans, triggerScan } from '../api/scanner';
|
||||||
|
import { getSiteConfig, updateSiteConfig } from '../api/sites';
|
||||||
import { trackFeatureUsage } from '../services/analytics';
|
import { trackFeatureUsage } from '../services/analytics';
|
||||||
import type { CookieDiffItem, ScanDiff, ScanJob, ScanJobDetail, ScanResult } from '../types/api';
|
import type { CookieDiffItem, ScanDiff, ScanJob, ScanJobDetail, ScanResult, SiteConfig } from '../types/api';
|
||||||
import { Alert } from './ui/alert';
|
import { Alert } from './ui/alert';
|
||||||
import { Badge } from './ui/badge';
|
import { Badge } from './ui/badge';
|
||||||
import { Button } from './ui/button';
|
import { Button } from './ui/button';
|
||||||
|
import { Card } from './ui/card';
|
||||||
import { LoadingState } from './ui/loading-state';
|
import { LoadingState } from './ui/loading-state';
|
||||||
|
import { Select } from './ui/select';
|
||||||
|
|
||||||
interface Props {
|
interface Props {
|
||||||
siteId: string;
|
siteId: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Preset choices offered in the scan-schedule dropdown.
 *
 * `value` is the `<option>` value, `label` is the text shown to the user and
 * `cron` is the expression saved as `scan_schedule_cron` (`null` turns
 * scheduling off).
 */
const SCHEDULE_OPTIONS: { value: string; label: string; cron: string | null }[] = [
  { value: 'disabled', label: 'Disabled', cron: null },
  { value: 'daily', label: 'Daily', cron: '0 3 * * *' },
  { value: 'weekly', label: 'Weekly', cron: '0 3 * * 0' },
  // Standard 5-field cron has no "every 14 days" form, so this preset fires
  // on the 1st and 15th of each month.
  { value: 'fortnightly', label: 'Fortnightly', cron: '0 3 1,15 * *' },
  { value: 'monthly', label: 'Monthly', cron: '0 3 1 * *' },
];

/**
 * Map a stored cron expression to the dropdown value that represents it.
 *
 * Surrounding whitespace is ignored and runs of whitespace between fields are
 * collapsed before comparing, so an expression that differs from a preset only
 * in spacing still selects that preset instead of falling through to
 * `'custom'`.
 *
 * @param cron - The saved cron expression, or `null`/`undefined` when unset.
 * @returns `'disabled'` when there is no expression (missing, empty or
 *   whitespace-only), the matching preset's `value` when one matches, and
 *   `'custom'` for anything else.
 */
function cronToScheduleValue(cron: string | null | undefined): string {
  const normalised = cron?.trim().split(/\s+/).join(' ') ?? '';
  if (!normalised) return 'disabled';
  const match = SCHEDULE_OPTIONS.find((o) => o.cron === normalised);
  return match?.value ?? 'custom';
}
|
||||||
|
|
||||||
function statusVariant(status: string): 'warning' | 'info' | 'success' | 'error' | 'neutral' {
|
function statusVariant(status: string): 'warning' | 'info' | 'success' | 'error' | 'neutral' {
|
||||||
const map: Record<string, 'warning' | 'info' | 'success' | 'error'> = {
|
const map: Record<string, 'warning' | 'info' | 'success' | 'error'> = {
|
||||||
pending: 'warning',
|
pending: 'warning',
|
||||||
@@ -183,6 +200,45 @@ export default function SiteScannerTab({ siteId }: Props) {
|
|||||||
const queryClient = useQueryClient();
|
const queryClient = useQueryClient();
|
||||||
const [expandedScanId, setExpandedScanId] = useState<string | null>(null);
|
const [expandedScanId, setExpandedScanId] = useState<string | null>(null);
|
||||||
|
|
||||||
|
const { data: config } = useQuery<SiteConfig>({
|
||||||
|
queryKey: ['sites', siteId, 'config'],
|
||||||
|
queryFn: () => getSiteConfig(siteId),
|
||||||
|
});
|
||||||
|
|
||||||
|
const currentCron = config?.scan_schedule_cron ?? null;
|
||||||
|
const savedValue = cronToScheduleValue(currentCron);
|
||||||
|
const [selectedSchedule, setSelectedSchedule] = useState<string | null>(null);
|
||||||
|
const [customCron, setCustomCron] = useState('');
|
||||||
|
|
||||||
|
// Use local selection if the user has interacted, otherwise fall
|
||||||
|
// back to what's saved on the server.
|
||||||
|
const activeValue = selectedSchedule ?? savedValue;
|
||||||
|
const showCustomInput = activeValue === 'custom';
|
||||||
|
|
||||||
|
const scheduleMutation = useMutation({
|
||||||
|
mutationFn: (cron: string | null) => updateSiteConfig(siteId, { scan_schedule_cron: cron } as Partial<SiteConfig>),
|
||||||
|
onSuccess: () => {
|
||||||
|
queryClient.invalidateQueries({ queryKey: ['sites', siteId, 'config'] });
|
||||||
|
trackFeatureUsage('scan', 'schedule_change', { site_id: siteId });
|
||||||
|
setSelectedSchedule(null); // reset to server state
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
const handleScheduleChange = (value: string) => {
|
||||||
|
setSelectedSchedule(value);
|
||||||
|
if (value === 'custom') {
|
||||||
|
setCustomCron(currentCron ?? '');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const option = SCHEDULE_OPTIONS.find((o) => o.value === value);
|
||||||
|
scheduleMutation.mutate(option?.cron ?? null);
|
||||||
|
};
|
||||||
|
|
||||||
|
const handleCustomSave = () => {
|
||||||
|
const trimmed = customCron.trim();
|
||||||
|
scheduleMutation.mutate(trimmed || null);
|
||||||
|
};
|
||||||
|
|
||||||
const { data: scans, isLoading } = useQuery<ScanJob[]>({
|
const { data: scans, isLoading } = useQuery<ScanJob[]>({
|
||||||
queryKey: ['scans', siteId],
|
queryKey: ['scans', siteId],
|
||||||
queryFn: () => listScans(siteId),
|
queryFn: () => listScans(siteId),
|
||||||
@@ -202,6 +258,64 @@ export default function SiteScannerTab({ siteId }: Props) {
|
|||||||
|
|
||||||
return (
|
return (
|
||||||
<div>
|
<div>
|
||||||
|
{/* Scan schedule */}
|
||||||
|
<Card className="mb-6 p-5">
|
||||||
|
<h3 className="font-heading mb-3 text-sm font-semibold text-foreground">Scan Schedule</h3>
|
||||||
|
<p className="mb-3 text-xs text-text-secondary">
|
||||||
|
Scheduled scans run automatically and re-discover cookies so your inventory stays
|
||||||
|
current. Select a preset or enter a custom cron expression.
|
||||||
|
</p>
|
||||||
|
<div className="flex flex-wrap items-end gap-3">
|
||||||
|
<div className="min-w-[180px]">
|
||||||
|
<Select
|
||||||
|
value={activeValue}
|
||||||
|
onChange={(e) => handleScheduleChange(e.target.value)}
|
||||||
|
disabled={scheduleMutation.isPending}
|
||||||
|
>
|
||||||
|
{SCHEDULE_OPTIONS.map((o) => (
|
||||||
|
<option key={o.value} value={o.value}>{o.label}</option>
|
||||||
|
))}
|
||||||
|
<option value="custom">Custom cron</option>
|
||||||
|
</Select>
|
||||||
|
</div>
|
||||||
|
{showCustomInput && (
|
||||||
|
<>
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
className="rounded-md border border-border bg-background px-3 py-2 font-mono text-sm text-foreground placeholder:text-text-tertiary focus:border-copper focus:outline-none"
|
||||||
|
placeholder="0 3 * * 0"
|
||||||
|
value={customCron}
|
||||||
|
onChange={(e) => setCustomCron(e.target.value)}
|
||||||
|
/>
|
||||||
|
<Button
|
||||||
|
variant="secondary"
|
||||||
|
size="sm"
|
||||||
|
onClick={handleCustomSave}
|
||||||
|
disabled={scheduleMutation.isPending || !customCron.trim()}
|
||||||
|
>
|
||||||
|
Save
|
||||||
|
</Button>
|
||||||
|
<a
|
||||||
|
href="https://crontab.guru"
|
||||||
|
target="_blank"
|
||||||
|
rel="noopener noreferrer"
|
||||||
|
className="text-xs text-copper hover:underline"
|
||||||
|
>
|
||||||
|
Need help? Use crontab.guru →
|
||||||
|
</a>
|
||||||
|
</>
|
||||||
|
)}
|
||||||
|
{scheduleMutation.isPending && (
|
||||||
|
<span className="text-xs text-text-secondary">Saving…</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
{currentCron && (
|
||||||
|
<p className="mt-2 text-xs text-text-secondary">
|
||||||
|
Current schedule: <code className="rounded bg-mist px-1.5 py-0.5 font-mono">{currentCron}</code>
|
||||||
|
</p>
|
||||||
|
)}
|
||||||
|
</Card>
|
||||||
|
|
||||||
{/* Header with trigger button */}
|
{/* Header with trigger button */}
|
||||||
<div className="mb-4 flex items-center justify-between">
|
<div className="mb-4 flex items-center justify-between">
|
||||||
<h2 className="font-heading text-lg font-semibold text-foreground">Cookie Scans</h2>
|
<h2 className="font-heading text-lg font-semibold text-foreground">Cookie Scans</h2>
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ const BASE_CONFIG: SiteConfig = {
|
|||||||
scan_enabled: true,
|
scan_enabled: true,
|
||||||
scan_frequency_hours: 168,
|
scan_frequency_hours: 168,
|
||||||
scan_max_pages: 50,
|
scan_max_pages: 50,
|
||||||
|
scan_schedule_cron: null,
|
||||||
enabled_categories: null,
|
enabled_categories: null,
|
||||||
created_at: '2025-01-01T00:00:00Z',
|
created_at: '2025-01-01T00:00:00Z',
|
||||||
updated_at: '2025-01-01T00:00:00Z',
|
updated_at: '2025-01-01T00:00:00Z',
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ const BASE_CONFIG: SiteConfig = {
|
|||||||
scan_enabled: true,
|
scan_enabled: true,
|
||||||
scan_frequency_hours: 168,
|
scan_frequency_hours: 168,
|
||||||
scan_max_pages: 50,
|
scan_max_pages: 50,
|
||||||
|
scan_schedule_cron: null,
|
||||||
enabled_categories: null,
|
enabled_categories: null,
|
||||||
created_at: '2025-01-01T00:00:00Z',
|
created_at: '2025-01-01T00:00:00Z',
|
||||||
updated_at: '2025-01-01T00:00:00Z',
|
updated_at: '2025-01-01T00:00:00Z',
|
||||||
|
|||||||
@@ -129,6 +129,7 @@ export interface SiteConfig {
|
|||||||
scan_enabled: boolean;
|
scan_enabled: boolean;
|
||||||
scan_frequency_hours: number;
|
scan_frequency_hours: number;
|
||||||
scan_max_pages: number;
|
scan_max_pages: number;
|
||||||
|
scan_schedule_cron: string | null;
|
||||||
/**
|
/**
|
||||||
* Cookie categories the banner should display. ``null`` means
|
* Cookie categories the banner should display. ``null`` means
|
||||||
* "inherit from the cascade" (group → org → system default of all
|
* "inherit from the cascade" (group → org → system default of all
|
||||||
|
|||||||
@@ -2263,3 +2263,4 @@ c7d8e9f0-0012-4567-890a-000000000012,Plausible Analytics,Analytics,plausible_,,"
|
|||||||
c7d8e9f0-0013-4567-890a-000000000013,Fathom Analytics,Analytics,_fathom,,"Privacy-focused simple website analytics with minimal data collection.",Varies,Conva Ventures Inc,https://usefathom.com/privacy,1
|
c7d8e9f0-0013-4567-890a-000000000013,Fathom Analytics,Analytics,_fathom,,"Privacy-focused simple website analytics with minimal data collection.",Varies,Conva Ventures Inc,https://usefathom.com/privacy,1
|
||||||
c7d8e9f0-0014-4567-890a-000000000014,Umami,Analytics,umami.,,"Open-source privacy-friendly web analytics alternative.",Varies,Website operator,https://umami.is/docs/about,1
|
c7d8e9f0-0014-4567-890a-000000000014,Umami,Analytics,umami.,,"Open-source privacy-friendly web analytics alternative.",Varies,Website operator,https://umami.is/docs/about,1
|
||||||
c7d8e9f0-0015-4567-890a-000000000015,Vercel,Functional,_vercel_,,"Vercel platform cookies for deployment previews and analytics.",Varies,Vercel Inc,https://vercel.com/legal/privacy-policy,1
|
c7d8e9f0-0015-4567-890a-000000000015,Vercel,Functional,_vercel_,,"Vercel platform cookies for deployment previews and analytics.",Varies,Vercel Inc,https://vercel.com/legal/privacy-policy,1
|
||||||
|
c7d8e9f0-0016-4567-890a-000000000016,Google Ads,Marketing,_gcl_ls,,"Google Click Identifier for localStorage-based ad conversion tracking.",90 Days,Google,https://business.safety.google/privacy/,0
|
||||||
|
|||||||
|
Can't render this file because it is too large.
|
@@ -174,6 +174,26 @@ def classify_cookie(
|
|||||||
|
|
||||||
This is a pure function — all data is passed in, no DB calls.
|
This is a pure function — all data is passed in, no DB calls.
|
||||||
"""
|
"""
|
||||||
|
# 0. ConsentOS's own cookies are always necessary. The banner's
|
||||||
|
# blocker already treats ``_consentos_*`` as exempt; the
|
||||||
|
# classifier must agree so the admin UI shows them in the
|
||||||
|
# right category without requiring a known-cookies DB entry.
|
||||||
|
if cookie_name.startswith("_consentos_"):
|
||||||
|
necessary = next(
|
||||||
|
(cat for cat in category_map.values() if cat.slug == "necessary"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
return ClassificationResult(
|
||||||
|
cookie_name=cookie_name,
|
||||||
|
cookie_domain=cookie_domain,
|
||||||
|
category_id=necessary.id if necessary else None,
|
||||||
|
category_slug="necessary",
|
||||||
|
vendor="ConsentOS",
|
||||||
|
description="ConsentOS consent management cookie.",
|
||||||
|
match_source=MatchSource.KNOWN_EXACT,
|
||||||
|
matched=True,
|
||||||
|
)
|
||||||
|
|
||||||
# 1. Check allow-list first (site-specific overrides)
|
# 1. Check allow-list first (site-specific overrides)
|
||||||
allow_match = _match_allow_list(cookie_name, cookie_domain, allow_list)
|
allow_match = _match_allow_list(cookie_name, cookie_domain, allow_list)
|
||||||
if allow_match:
|
if allow_match:
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ from datetime import UTC, datetime
|
|||||||
from sqlalchemy import select
|
from sqlalchemy import select
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from src.models.cookie import Cookie
|
from src.models.cookie import Cookie, CookieCategory
|
||||||
from src.models.scan import ScanJob, ScanResult
|
from src.models.scan import ScanJob, ScanResult
|
||||||
from src.models.site import Site
|
from src.models.site import Site
|
||||||
from src.schemas.scanner import (
|
from src.schemas.scanner import (
|
||||||
@@ -261,7 +261,13 @@ async def sync_scan_results_to_cookies(
|
|||||||
"""Upsert scan results into the site's cookie inventory.
|
"""Upsert scan results into the site's cookie inventory.
|
||||||
|
|
||||||
Creates new Cookie records for newly discovered items or updates
|
Creates new Cookie records for newly discovered items or updates
|
||||||
last_seen_at for existing ones. Returns the number of new cookies.
|
``last_seen_at`` for existing ones. When ``auto_category`` is set
|
||||||
|
on the scan result and the cookie doesn't already have a
|
||||||
|
manually-assigned category, the auto-classified category is
|
||||||
|
propagated to the cookie inventory so it shows up categorised in
|
||||||
|
the admin UI without requiring manual review.
|
||||||
|
|
||||||
|
Returns the number of new cookies.
|
||||||
"""
|
"""
|
||||||
results = await db.execute(select(ScanResult).where(ScanResult.scan_job_id == scan_job_id))
|
results = await db.execute(select(ScanResult).where(ScanResult.scan_job_id == scan_job_id))
|
||||||
items = list(results.scalars().all())
|
items = list(results.scalars().all())
|
||||||
@@ -269,6 +275,10 @@ async def sync_scan_results_to_cookies(
|
|||||||
now_iso = datetime.now(UTC).isoformat()
|
now_iso = datetime.now(UTC).isoformat()
|
||||||
new_count = 0
|
new_count = 0
|
||||||
|
|
||||||
|
# Pre-load the category slug → id mapping so we don't query per cookie.
|
||||||
|
cat_rows = await db.execute(select(CookieCategory))
|
||||||
|
slug_to_id: dict[str, uuid.UUID] = {cat.slug: cat.id for cat in cat_rows.scalars().all()}
|
||||||
|
|
||||||
for item in items:
|
for item in items:
|
||||||
existing = await db.execute(
|
existing = await db.execute(
|
||||||
select(Cookie).where(
|
select(Cookie).where(
|
||||||
@@ -280,14 +290,21 @@ async def sync_scan_results_to_cookies(
|
|||||||
)
|
)
|
||||||
cookie = existing.scalar_one_or_none()
|
cookie = existing.scalar_one_or_none()
|
||||||
|
|
||||||
|
# Resolve the auto-category slug to a category_id.
|
||||||
|
auto_cat_id = slug_to_id.get(item.auto_category) if item.auto_category else None
|
||||||
|
|
||||||
if cookie:
|
if cookie:
|
||||||
cookie.last_seen_at = now_iso
|
cookie.last_seen_at = now_iso
|
||||||
|
# Back-fill the category if not manually assigned yet.
|
||||||
|
if auto_cat_id and not cookie.category_id:
|
||||||
|
cookie.category_id = auto_cat_id
|
||||||
else:
|
else:
|
||||||
cookie = Cookie(
|
cookie = Cookie(
|
||||||
site_id=site_id,
|
site_id=site_id,
|
||||||
name=item.cookie_name,
|
name=item.cookie_name,
|
||||||
domain=item.cookie_domain,
|
domain=item.cookie_domain,
|
||||||
storage_type=item.storage_type,
|
storage_type=item.storage_type,
|
||||||
|
category_id=auto_cat_id,
|
||||||
review_status="pending",
|
review_status="pending",
|
||||||
first_seen_at=now_iso,
|
first_seen_at=now_iso,
|
||||||
last_seen_at=now_iso,
|
last_seen_at=now_iso,
|
||||||
|
|||||||
@@ -68,11 +68,13 @@ def _build_consent_cookie(url: str) -> dict:
|
|||||||
"bannerVersion": "scanner",
|
"bannerVersion": "scanner",
|
||||||
}
|
}
|
||||||
value = quote(json.dumps(state, separators=(",", ":")), safe="")
|
value = quote(json.dumps(state, separators=(",", ":")), safe="")
|
||||||
|
# Playwright's ``add_cookies`` accepts EITHER ``url`` (from which
|
||||||
|
# it derives domain/path/secure) OR explicit ``domain`` + ``path``
|
||||||
|
# — but not both. Using ``url`` is simplest.
|
||||||
return {
|
return {
|
||||||
"name": _CONSENT_COOKIE_NAME,
|
"name": _CONSENT_COOKIE_NAME,
|
||||||
"value": value,
|
"value": value,
|
||||||
"url": url,
|
"url": url,
|
||||||
"path": "/",
|
|
||||||
"expires": time.time() + 365 * 86400,
|
"expires": time.time() + 365 * 86400,
|
||||||
"sameSite": "Lax",
|
"sameSite": "Lax",
|
||||||
}
|
}
|
||||||
@@ -201,6 +203,9 @@ class CookieCrawler:
|
|||||||
script_cookies: dict[str, str] = {} # cookie name → script URL
|
script_cookies: dict[str, str] = {} # cookie name → script URL
|
||||||
initiator_map: dict[str, str] = {} # request URL → initiating URL
|
initiator_map: dict[str, str] = {} # request URL → initiating URL
|
||||||
initiator_chains: dict[str, list[str]] = {} # cookie name → chain
|
initiator_chains: dict[str, list[str]] = {} # cookie name → chain
|
||||||
|
# Cookies discovered directly from Set-Cookie response headers.
|
||||||
|
# Keyed by (name, domain) so they can be merged with CDP results.
|
||||||
|
header_cookies: dict[tuple[str, str], DiscoveredCookie] = {}
|
||||||
|
|
||||||
context: BrowserContext | None = None
|
context: BrowserContext | None = None
|
||||||
try:
|
try:
|
||||||
@@ -236,7 +241,9 @@ class CookieCrawler:
|
|||||||
|
|
||||||
page.on("request", _on_request)
|
page.on("request", _on_request)
|
||||||
|
|
||||||
# Track Set-Cookie headers from responses
|
# Track Set-Cookie headers from responses and create
|
||||||
|
# DiscoveredCookie entries directly — CDP's context.cookies()
|
||||||
|
# may not enumerate cross-domain cookies.
|
||||||
async def _on_response(response: Response) -> None:
|
async def _on_response(response: Response) -> None:
|
||||||
try:
|
try:
|
||||||
headers = await response.all_headers()
|
headers = await response.all_headers()
|
||||||
@@ -247,25 +254,67 @@ class CookieCrawler:
|
|||||||
initiator = _get_script_initiator(request)
|
initiator = _get_script_initiator(request)
|
||||||
# Build the initiator chain for this request
|
# Build the initiator chain for this request
|
||||||
chain = _build_initiator_chain(request.url, initiator_map)
|
chain = _build_initiator_chain(request.url, initiator_map)
|
||||||
|
resp_domain = urlparse(response.url).hostname or ""
|
||||||
for cookie_str in set_cookie.split("\n"):
|
for cookie_str in set_cookie.split("\n"):
|
||||||
name = cookie_str.split("=")[0].strip()
|
name = cookie_str.split("=")[0].strip()
|
||||||
if name:
|
if name:
|
||||||
if initiator:
|
if initiator:
|
||||||
script_cookies[name] = initiator
|
script_cookies[name] = initiator
|
||||||
initiator_chains[name] = chain
|
initiator_chains[name] = chain
|
||||||
|
# Parse optional Domain attribute from
|
||||||
|
# the Set-Cookie header; fall back to
|
||||||
|
# the response hostname.
|
||||||
|
domain = resp_domain
|
||||||
|
for part in cookie_str.split(";")[1:]:
|
||||||
|
part = part.strip()
|
||||||
|
if part.lower().startswith("domain="):
|
||||||
|
domain = part.split("=", 1)[1].strip()
|
||||||
|
break
|
||||||
|
key = (name, domain)
|
||||||
|
if key not in header_cookies:
|
||||||
|
header_cookies[key] = DiscoveredCookie(
|
||||||
|
name=name,
|
||||||
|
domain=domain,
|
||||||
|
storage_type="cookie",
|
||||||
|
script_source=initiator,
|
||||||
|
page_url=url,
|
||||||
|
initiator_chain=chain,
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass # Non-critical — response may have been aborted
|
pass # Non-critical — response may have been aborted
|
||||||
|
|
||||||
page.on("response", _on_response)
|
page.on("response", _on_response)
|
||||||
|
|
||||||
# Navigate
|
# Navigate — networkidle waits until ≤2 active connections for
|
||||||
await page.goto(url, wait_until="domcontentloaded", timeout=self._timeout_ms)
|
# 500ms, which catches the GA beacon round-trip that
|
||||||
# Allow additional time for scripts to set cookies after DOM load.
|
# domcontentloaded misses.
|
||||||
await page.wait_for_timeout(3000)
|
await page.goto(url, wait_until="networkidle", timeout=self._timeout_ms)
|
||||||
|
# Safety margin for late-firing scripts (e.g. deferred GTM tags).
|
||||||
|
await page.wait_for_timeout(5000)
|
||||||
|
|
||||||
# Enumerate browser cookies via CDP
|
# First pass — enumerate browser cookies via CDP.
|
||||||
cdp_cookies = await context.cookies()
|
cdp_cookies = await context.cookies()
|
||||||
|
|
||||||
|
# Second pass — wait a further 2 seconds for any delayed
|
||||||
|
# Set-Cookie headers, then merge newly appeared cookies.
|
||||||
|
await page.wait_for_timeout(2000)
|
||||||
|
delayed_cookies = await context.cookies()
|
||||||
|
|
||||||
|
# Merge: index first-pass cookies by (name, domain), then
|
||||||
|
# add any that only appeared in the second pass.
|
||||||
|
seen_keys: set[tuple[str, str]] = set()
|
||||||
|
all_cdp_cookies: list[dict] = []
|
||||||
for c in cdp_cookies:
|
for c in cdp_cookies:
|
||||||
|
key = (c["name"], c["domain"])
|
||||||
|
seen_keys.add(key)
|
||||||
|
all_cdp_cookies.append(c)
|
||||||
|
for c in delayed_cookies:
|
||||||
|
key = (c["name"], c["domain"])
|
||||||
|
if key not in seen_keys:
|
||||||
|
seen_keys.add(key)
|
||||||
|
all_cdp_cookies.append(c)
|
||||||
|
|
||||||
|
for c in all_cdp_cookies:
|
||||||
result.cookies.append(
|
result.cookies.append(
|
||||||
DiscoveredCookie(
|
DiscoveredCookie(
|
||||||
name=c["name"],
|
name=c["name"],
|
||||||
@@ -283,6 +332,13 @@ class CookieCrawler:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Merge cookies seen in Set-Cookie headers but NOT in the
|
||||||
|
# CDP cookie jar (e.g. cross-domain cookies that the browser
|
||||||
|
# scoped to a different origin).
|
||||||
|
for key, hc in header_cookies.items():
|
||||||
|
if key not in seen_keys:
|
||||||
|
result.cookies.append(hc)
|
||||||
|
|
||||||
# Enumerate localStorage
|
# Enumerate localStorage
|
||||||
ls_items = await page.evaluate("""() => {
|
ls_items = await page.evaluate("""() => {
|
||||||
const items = [];
|
const items = [];
|
||||||
|
|||||||
@@ -75,6 +75,13 @@ async def _fetch_sitemap(
|
|||||||
if resp.status_code != 200:
|
if resp.status_code != 200:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# SPAs with catch-all nginx/Caddy rules return 200 + text/html
|
||||||
|
# for /sitemap.xml. Don't try to parse HTML as XML.
|
||||||
|
content_type = resp.headers.get("content-type", "")
|
||||||
|
if "html" in content_type and "xml" not in content_type:
|
||||||
|
logger.debug("Sitemap %s returned HTML, skipping", url)
|
||||||
|
return []
|
||||||
|
|
||||||
root = ElementTree.fromstring(resp.text)
|
root = ElementTree.fromstring(resp.text)
|
||||||
|
|
||||||
# Check if it's a sitemap index
|
# Check if it's a sitemap index
|
||||||
|
|||||||
@@ -42,11 +42,35 @@ def _make_mock_page(
|
|||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
def _make_mock_context(page, cookies: list[dict] | None = None):
|
def _make_mock_context(
|
||||||
"""Build a mock BrowserContext."""
|
page,
|
||||||
|
cookies: list[dict] | None = None,
|
||||||
|
delayed_cookies: list[dict] | None = None,
|
||||||
|
):
|
||||||
|
"""Build a mock BrowserContext.
|
||||||
|
|
||||||
|
*cookies* is returned on the first ``context.cookies()`` call (the
|
||||||
|
initial CDP enumeration). *delayed_cookies* is returned on the
|
||||||
|
second call (the delayed pass); defaults to the same list so
|
||||||
|
existing tests need no changes.
|
||||||
|
"""
|
||||||
context = AsyncMock()
|
context = AsyncMock()
|
||||||
context.new_page = AsyncMock(return_value=page)
|
context.new_page = AsyncMock(return_value=page)
|
||||||
context.cookies = AsyncMock(return_value=cookies or [])
|
first = cookies or []
|
||||||
|
second = delayed_cookies if delayed_cookies is not None else first
|
||||||
|
# The crawler calls context.cookies() twice per page (initial +
|
||||||
|
# delayed pass). Using a cycling function instead of a fixed-length
|
||||||
|
# side_effect list so multi-page tests don't exhaust the mock.
|
||||||
|
_cycle = [first, second]
|
||||||
|
_call_count = 0
|
||||||
|
|
||||||
|
async def _cycling_cookies(*_args, **_kwargs):
|
||||||
|
nonlocal _call_count
|
||||||
|
result = _cycle[_call_count % len(_cycle)]
|
||||||
|
_call_count += 1
|
||||||
|
return result
|
||||||
|
|
||||||
|
context.cookies = AsyncMock(side_effect=_cycling_cookies)
|
||||||
context.clear_cookies = AsyncMock()
|
context.clear_cookies = AsyncMock()
|
||||||
context.close = AsyncMock()
|
context.close = AsyncMock()
|
||||||
return context
|
return context
|
||||||
@@ -373,6 +397,44 @@ class TestCrawlPage:
|
|||||||
call_kwargs = browser.new_context.call_args[1]
|
call_kwargs = browser.new_context.call_args[1]
|
||||||
assert call_kwargs["user_agent"] == "CMPBot/1.0"
|
assert call_kwargs["user_agent"] == "CMPBot/1.0"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio(loop_scope="session")
async def test_two_pass_cookie_collection_merges_delayed(self):
    """A cookie present only in the delayed CDP pass is reported, without duplicating earlier ones."""
    ga_cookie = {"name": "_ga", "domain": ".example.com", "value": "GA1.2.12345"}
    gid_cookie = {"name": "_gid", "domain": ".example.com", "value": "GID.99"}

    # The delayed pass repeats ``_ga`` (as a separate dict) and adds ``_gid``.
    mock_page = _make_mock_page()
    mock_context = _make_mock_context(
        mock_page,
        cookies=[ga_cookie],
        delayed_cookies=[dict(ga_cookie), gid_cookie],
    )
    mock_browser = _make_mock_browser(mock_context)

    outcome = await CookieCrawler()._crawl_page(mock_browser, "https://example.com/")

    found = [entry.name for entry in outcome.cookies if entry.storage_type == "cookie"]
    assert "_ga" in found
    assert "_gid" in found
    # _ga must not be duplicated
    assert found.count("_ga") == 1
|
||||||
|
|
||||||
|
@pytest.mark.asyncio(loop_scope="session")
async def test_uses_networkidle_wait(self):
    """The single ``page.goto`` call must pass ``wait_until='networkidle'``."""
    mock_page = _make_mock_page()
    mock_browser = _make_mock_browser(_make_mock_context(mock_page))

    await CookieCrawler()._crawl_page(mock_browser, "https://example.com/")

    mock_page.goto.assert_awaited_once()
    # ``call_args`` unpacks to (positional args, keyword args).
    _positional, keyword = mock_page.goto.call_args
    assert keyword.get("wait_until") == "networkidle"
|
||||||
|
|
||||||
|
|
||||||
# ── CookieCrawler.crawl_site ───────────────────────────────────────────
|
# ── CookieCrawler.crawl_site ───────────────────────────────────────────
|
||||||
|
|
||||||
@@ -457,7 +519,9 @@ class TestBuildConsentCookie:
|
|||||||
"""``url`` lets Playwright derive domain / path / secure."""
|
"""``url`` lets Playwright derive domain / path / secure."""
|
||||||
cookie = _build_consent_cookie("https://example.com/page")
|
cookie = _build_consent_cookie("https://example.com/page")
|
||||||
assert cookie["url"] == "https://example.com/page"
|
assert cookie["url"] == "https://example.com/page"
|
||||||
assert cookie["path"] == "/"
|
# ``path`` is NOT set explicitly — Playwright derives it from ``url``.
|
||||||
|
# Setting both would cause ``add_cookies`` to reject the cookie.
|
||||||
|
assert "path" not in cookie
|
||||||
|
|
||||||
def test_cookie_value_decodes_to_consent_state_with_all_categories(self):
|
def test_cookie_value_decodes_to_consent_state_with_all_categories(self):
|
||||||
import json as _json
|
import json as _json
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ services:
|
|||||||
command:
|
command:
|
||||||
- "sh"
|
- "sh"
|
||||||
- "-c"
|
- "-c"
|
||||||
- "python -m alembic upgrade head && python -m src.cli.bootstrap_admin"
|
- "python -m alembic upgrade head && python -m src.cli.bootstrap_admin && python -m src.cli.seed_known_cookies"
|
||||||
restart: "no"
|
restart: "no"
|
||||||
depends_on:
|
depends_on:
|
||||||
postgres:
|
postgres:
|
||||||
|
|||||||
Reference in New Issue
Block a user