feat: initial public release
ConsentOS is a privacy-first cookie consent management platform: a self-hosted, source-available alternative to OneTrust, Cookiebot, and CookieYes. It ships with full standards coverage (IAB TCF v2.2, GPP v1, Google Consent Mode v2, GPC, Shopify Customer Privacy API), a multi-tenant architecture with role-based access, a configuration cascade (system → org → group → site → region), dark-pattern detection in the scanner, and a tamper-evident consent record audit trail. This is the initial public release; prior development history is retained internally. See README.md for the feature list, architecture overview, and quick-start instructions. Licensed under the Elastic License 2.0: self-host freely, but do not resell it as a managed service.
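For reviewers unfamiliar with the configuration cascade named above, a minimal sketch of the resolution order it implies. This is illustrative only and not part of this commit; the helper name and the "most specific layer wins" semantics are assumptions, not shipped code.

# Illustrative sketch only. Assumes the cascade resolves a setting by letting the
# most specific layer that defines a key override the layers above it.
def resolve_setting(key: str, layers: list[dict]) -> object | None:
    # layers ordered least to most specific: system, org, group, site, region
    value = None
    for layer in layers:
        if key in layer:
            value = layer[key]
    return value

layers = [
    {"banner_theme": "light", "reject_button": True},  # system
    {"banner_theme": "dark"},                          # org
    {},                                                # group
    {"banner_theme": "brand"},                         # site
    {},                                                # region
]
assert resolve_setting("banner_theme", layers) == "brand"
assert resolve_setting("reject_button", layers) is True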
apps/scanner/src/__init__.py (new file, 0 lines)

apps/scanner/src/classifier.py (new file, 107 lines)
@@ -0,0 +1,107 @@
"""Cookie classification based on known patterns.

Matches discovered cookies against a database of known cookie patterns
to auto-categorise them (analytics, marketing, functional, etc.).
"""

from __future__ import annotations

import re
from dataclasses import dataclass


@dataclass
class KnownPattern:
    """A known cookie pattern for classification."""

    name_pattern: str
    domain_pattern: str
    category: str
    vendor: str | None = None
    is_regex: bool = False


@dataclass
class ClassificationResult:
    """Result of classifying a cookie."""

    category: str | None
    vendor: str | None = None
    match_source: str = "unmatched"  # exact | wildcard | regex | unmatched


def classify_cookie(
    name: str,
    domain: str,
    patterns: list[KnownPattern],
) -> ClassificationResult:
    """Classify a cookie by matching against known patterns.

    Matching priority:
    1. Exact name match
    2. Wildcard match (patterns containing *)
    3. Regex match (patterns flagged as regex)
    """
    # Pass 1: exact name matches take priority over wildcard and regex patterns,
    # matching the documented priority above.
    for pattern in patterns:
        if pattern.is_regex or "*" in pattern.name_pattern:
            continue
        if pattern.name_pattern == name and _domain_matches(domain, pattern.domain_pattern):
            return ClassificationResult(
                category=pattern.category,
                vendor=pattern.vendor,
                match_source="exact",
            )

    # Pass 2: wildcard patterns (containing *).
    for pattern in patterns:
        if pattern.is_regex or "*" not in pattern.name_pattern:
            continue
        regex = pattern.name_pattern.replace(".", r"\.").replace("*", ".*")
        if re.match(f"^{regex}$", name, re.IGNORECASE) and _domain_matches(
            domain, pattern.domain_pattern
        ):
            return ClassificationResult(
                category=pattern.category,
                vendor=pattern.vendor,
                match_source="wildcard",
            )

    # Pass 3: regex patterns.
    for pattern in patterns:
        if not pattern.is_regex:
            continue
        try:
            if re.match(pattern.name_pattern, name, re.IGNORECASE) and _domain_matches(
                domain, pattern.domain_pattern
            ):
                return ClassificationResult(
                    category=pattern.category,
                    vendor=pattern.vendor,
                    match_source="regex",
                )
        except re.error:
            continue

    return ClassificationResult(category=None, match_source="unmatched")


def _domain_matches(actual: str, pattern: str) -> bool:
    """Check if a domain matches a pattern.

    Patterns can be:
    - "*" — matches any domain
    - ".example.com" — matches example.com and *.example.com
    - "example.com" — exact match
    """
    if pattern == "*":
        return True

    actual = actual.lower().lstrip(".")
    pattern = pattern.lower().lstrip(".")

    if actual == pattern:
        return True

    # Subdomain match: actual "sub.example.com" matches pattern "example.com"
    if actual.endswith(f".{pattern}"):
        return True

    return False
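A quick usage sketch of the classifier above. Illustrative only, not part of the committed file; the example patterns are made up, and the `src.classifier` import path follows the `src.*` convention used elsewhere in this commit.

from src.classifier import KnownPattern, classify_cookie

patterns = [
    KnownPattern(name_pattern="_ga", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(name_pattern="_ga_*", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(
        name_pattern=r"^_hj\w+$", domain_pattern="*", category="analytics",
        vendor="Hotjar", is_regex=True,
    ),
]

result = classify_cookie("_ga_ABC123", "shop.example.com", patterns)
# Expected: category="analytics", vendor="Google", match_source="wildcard"
print(result.category, result.vendor, result.match_source)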
apps/scanner/src/consent_validator.py (new file, 280 lines)
@@ -0,0 +1,280 @@
"""Consent signal validation — Playwright-based runtime checks.

Validates that consent signals (GCM, TCF, GPP) work correctly at runtime
by checking pre-consent, post-accept, and post-reject states.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field

from playwright.async_api import BrowserContext, Page

logger = logging.getLogger(__name__)

# Known tracker domains for pixel-fire detection
KNOWN_TRACKER_DOMAINS = frozenset(
    {
        "google-analytics.com",
        "googletagmanager.com",
        "doubleclick.net",
        "facebook.net",
        "facebook.com",
        "connect.facebook.net",
        "analytics.tiktok.com",
        "snap.licdn.com",
        "bat.bing.com",
        "clarity.ms",
        "hotjar.com",
        "mouseflow.com",
        "cdn.segment.com",
        "cdn.mxpnl.com",
        "plausible.io",
        "px.ads.linkedin.com",
    }
)


@dataclass
class ConsentSignalState:
    """Captured consent signal state from the page."""

    gcm_state: dict | None = None
    tcf_data: dict | None = None
    gpp_data: dict | None = None


@dataclass
class ValidationIssue:
    """A single consent validation issue."""

    check: str
    severity: str  # critical, warning, info
    message: str
    recommendation: str
    details: dict = field(default_factory=dict)


@dataclass
class ValidationResult:
    """Result of consent signal validation for a page."""

    url: str
    pre_consent_issues: list[ValidationIssue] = field(default_factory=list)
    post_accept_issues: list[ValidationIssue] = field(default_factory=list)
    post_reject_issues: list[ValidationIssue] = field(default_factory=list)
    error: str | None = None

    @property
    def all_issues(self) -> list[ValidationIssue]:
        return self.pre_consent_issues + self.post_accept_issues + self.post_reject_issues

    @property
    def has_issues(self) -> bool:
        return bool(self.all_issues)


async def _get_consent_signals(page: Page) -> ConsentSignalState:
    """Extract current consent signal state from the page."""
    state = ConsentSignalState()

    # Read GCM state
    try:
        gcm = await page.evaluate("""() => {
            try {
                if (window.dataLayer) {
                    const consentEvents = window.dataLayer.filter(
                        e => e[0] === 'consent' || (e.event && e.event.includes('consent'))
                    );
                    return { dataLayer: consentEvents, available: true };
                }
                return { available: false };
            } catch (e) { return { error: e.message }; }
        }""")
        state.gcm_state = gcm
    except Exception:
        pass

    # Read TCF state
    try:
        tcf = await page.evaluate("""() => {
            return new Promise((resolve) => {
                if (typeof window.__tcfapi === 'function') {
                    window.__tcfapi('getTCData', 2, (data, success) => {
                        resolve({ available: true, success, data: data || null });
                    });
                } else {
                    resolve({ available: false });
                }
            });
        }""")
        state.tcf_data = tcf
    except Exception:
        pass

    # Read GPP state
    try:
        gpp = await page.evaluate("""() => {
            return new Promise((resolve) => {
                if (typeof window.__gpp === 'function') {
                    window.__gpp('getGPPData', (data, success) => {
                        resolve({ available: true, success, data: data || null });
                    });
                } else {
                    resolve({ available: false });
                }
            });
        }""")
        state.gpp_data = gpp
    except Exception:
        pass

    return state


async def _get_cookies_from_context(context: BrowserContext) -> list[dict]:
    """Get all cookies from the browser context."""
    return await context.cookies()


def _is_tracker_request(url: str) -> bool:
    """Check if a URL belongs to a known tracker domain."""
    for domain in KNOWN_TRACKER_DOMAINS:
        if domain in url:
            return True
    return False


async def validate_pre_consent(
    page: Page,
    context: BrowserContext,
    essential_cookie_names: set[str],
    tracker_requests: list[str],
) -> list[ValidationIssue]:
    """Validate that no non-essential activity occurs before consent."""
    issues: list[ValidationIssue] = []

    # Check cookies — only essential should be set
    cookies = await _get_cookies_from_context(context)
    non_essential = [c for c in cookies if c["name"] not in essential_cookie_names]
    if non_essential:
        names = [c["name"] for c in non_essential]
        issues.append(
            ValidationIssue(
                check="pre_consent_cookies",
                severity="critical",
                message=(
                    f"{len(non_essential)} non-essential cookie(s) set before consent: "
                    f"{', '.join(names[:5])}"
                ),
                recommendation=(
                    "Ensure all non-essential cookies are blocked until consent is given."
                ),
                details={"cookies": names},
            )
        )

    # Check tracker requests
    tracker_hits = [url for url in tracker_requests if _is_tracker_request(url)]
    if tracker_hits:
        issues.append(
            ValidationIssue(
                check="pre_consent_trackers",
                severity="critical",
                message=f"{len(tracker_hits)} tracking request(s) fired before consent.",
                recommendation="Block all tracking scripts until the user grants consent.",
                details={"tracker_urls": tracker_hits[:10]},
            )
        )

    # Check GCM defaults
    signals = await _get_consent_signals(page)
    if signals.gcm_state and signals.gcm_state.get("available"):
        # GCM should show denied for non-essential types
        pass  # GCM state captured for reporting

    # Check TCF — no purpose consents should be active
    if signals.tcf_data and signals.tcf_data.get("available"):
        tcf_data = signals.tcf_data.get("data") or {}
        purpose_consents = tcf_data.get("purpose", {}).get("consents", {})
        granted_purposes = [k for k, v in purpose_consents.items() if v]
        if granted_purposes:
            issues.append(
                ValidationIssue(
                    check="pre_consent_tcf",
                    severity="critical",
                    message=f"TCF purpose consents active before user action: {granted_purposes}",
                    recommendation="TCF should report no purpose consents until user grants them.",
                    details={"granted_purposes": granted_purposes},
                )
            )

    return issues


async def validate_post_accept(
    page: Page,
    context: BrowserContext,
) -> list[ValidationIssue]:
    """Validate consent signals after Accept All is clicked."""
    issues: list[ValidationIssue] = []

    signals = await _get_consent_signals(page)

    # Check TCF — purposes should now be consented
    if signals.tcf_data and signals.tcf_data.get("available"):
        if not signals.tcf_data.get("success"):
            issues.append(
                ValidationIssue(
                    check="post_accept_tcf",
                    severity="warning",
                    message="TCF getTCData returned unsuccessful after Accept All.",
                    recommendation=("Verify TCF API returns valid TC data after consent."),
                )
            )

    return issues


async def validate_post_reject(
    page: Page,
    context: BrowserContext,
    essential_cookie_names: set[str],
    tracker_requests: list[str],
) -> list[ValidationIssue]:
    """Validate that rejection is respected — no tracking after reject."""
    issues: list[ValidationIssue] = []

    # Check cookies after reject
    cookies = await _get_cookies_from_context(context)
    non_essential = [c for c in cookies if c["name"] not in essential_cookie_names]
    if non_essential:
        names = [c["name"] for c in non_essential]
        issues.append(
            ValidationIssue(
                check="post_reject_cookies",
                severity="critical",
                message=(
                    f"{len(non_essential)} non-essential cookie(s) remain after rejection: "
                    f"{', '.join(names[:5])}"
                ),
                recommendation="Ensure all non-essential cookies are removed when user rejects.",
                details={"cookies": names},
            )
        )

    # Check tracker requests after reject
    tracker_hits = [url for url in tracker_requests if _is_tracker_request(url)]
    if tracker_hits:
        issues.append(
            ValidationIssue(
                check="post_reject_trackers",
                severity="critical",
                message=f"{len(tracker_hits)} tracking request(s) fired after rejection.",
                recommendation="Ensure tracking scripts respect rejection and do not fire.",
                details={"tracker_urls": tracker_hits[:10]},
            )
        )

    return issues
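A small sketch of how the result types above compose. Illustrative only, not part of the committed file; the field values are invented.

from src.consent_validator import ValidationIssue, ValidationResult

result = ValidationResult(url="https://example.com/")
result.pre_consent_issues.append(
    ValidationIssue(
        check="pre_consent_trackers",
        severity="critical",
        message="2 tracking request(s) fired before consent.",
        recommendation="Block all tracking scripts until the user grants consent.",
    )
)

# all_issues flattens the three phases; has_issues is a convenience flag.
critical = [i for i in result.all_issues if i.severity == "critical"]
assert result.has_issues and len(critical) == 1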
apps/scanner/src/crawler.py (new file, 335 lines)
@@ -0,0 +1,335 @@
"""Playwright-based headless browser cookie crawler.

For each URL: launches headless Chromium, clears cookies, navigates,
waits for the DOM to load plus a short grace period for late-firing
scripts, enumerates document.cookie / localStorage / sessionStorage,
captures Set-Cookie headers from network requests, and attributes
cookies to source scripts via the request chain.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from urllib.parse import urlparse

from playwright.async_api import (
    Browser,
    BrowserContext,
    Page,
    Request,
    Response,
    async_playwright,
)

logger = logging.getLogger(__name__)

# Realistic Chrome UA so sites don't block the crawler as a bot.
_DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)


@dataclass
class DiscoveredCookie:
    """A cookie or storage item found during a crawl."""

    name: str
    domain: str
    storage_type: str = "cookie"  # cookie | local_storage | session_storage
    path: str | None = None
    expires: float | None = None
    http_only: bool | None = None
    secure: bool | None = None
    same_site: str | None = None
    value_length: int = 0
    script_source: str | None = None
    page_url: str = ""
    initiator_chain: list[str] = field(default_factory=list)


@dataclass
class CrawlResult:
    """Result of crawling a single page."""

    url: str
    cookies: list[DiscoveredCookie] = field(default_factory=list)
    error: str | None = None


@dataclass
class SiteCrawlResult:
    """Aggregated result of crawling all pages on a site."""

    domain: str
    pages: list[CrawlResult] = field(default_factory=list)
    total_cookies_found: int = 0

    @property
    def unique_cookies(self) -> list[DiscoveredCookie]:
        """Deduplicate cookies across pages by (name, domain, storage_type)."""
        seen: dict[tuple[str, str, str], DiscoveredCookie] = {}
        for page in self.pages:
            for cookie in page.cookies:
                key = (cookie.name, cookie.domain, cookie.storage_type)
                if key not in seen:
                    seen[key] = cookie
        return list(seen.values())


@dataclass
class ProxyConfig:
    """Proxy configuration for geo-located scanning."""

    server: str  # e.g. "http://proxy-eu.example.com:8080"
    username: str | None = None
    password: str | None = None


class CookieCrawler:
    """Crawls a site using Playwright to discover cookies and storage items."""

    def __init__(
        self,
        *,
        headless: bool = True,
        timeout_ms: int = 30_000,
        user_agent: str = _DEFAULT_USER_AGENT,
        proxy: ProxyConfig | None = None,
    ) -> None:
        self._headless = headless
        self._timeout_ms = timeout_ms
        self._user_agent = user_agent
        self._proxy = proxy

    async def crawl_site(
        self,
        urls: list[str],
        *,
        max_pages: int = 50,
    ) -> SiteCrawlResult:
        """Crawl multiple URLs and aggregate cookie discoveries."""
        if not urls:
            return SiteCrawlResult(domain="")

        domain = urlparse(urls[0]).hostname or ""
        result = SiteCrawlResult(domain=domain)

        async with async_playwright() as pw:
            launch_kwargs: dict = {"headless": self._headless}
            if self._proxy:
                proxy_opts: dict = {"server": self._proxy.server}
                if self._proxy.username:
                    proxy_opts["username"] = self._proxy.username
                if self._proxy.password:
                    proxy_opts["password"] = self._proxy.password
                launch_kwargs["proxy"] = proxy_opts
            browser = await pw.chromium.launch(**launch_kwargs)
            try:
                for url in urls[:max_pages]:
                    page_result = await self._crawl_page(browser, url)
                    result.pages.append(page_result)
                    result.total_cookies_found += len(page_result.cookies)
            finally:
                await browser.close()

        return result

    async def _crawl_page(
        self,
        browser: Browser,
        url: str,
    ) -> CrawlResult:
        """Crawl a single page and discover cookies."""
        result = CrawlResult(url=url)
        script_cookies: dict[str, str] = {}  # cookie name → script URL
        initiator_map: dict[str, str] = {}  # request URL → initiating URL
        initiator_chains: dict[str, list[str]] = {}  # cookie name → chain

        context: BrowserContext | None = None
        try:
            context = await browser.new_context(
                user_agent=self._user_agent,
                ignore_https_errors=True,
            )
            # Clear all cookies before visiting
            await context.clear_cookies()

            page: Page = await context.new_page()

            # Track request initiator chains via frame URL and redirect chains
            def _on_request(request: Request) -> None:
                try:
                    req_url = request.url
                    # Follow redirect chain to find the original initiator
                    redirected = request.redirected_from
                    if redirected:
                        initiator_map[req_url] = redirected.url
                    else:
                        # Use the frame URL as the parent initiator
                        frame_url = request.frame.url if request.frame else ""
                        if frame_url and frame_url != req_url:
                            initiator_map[req_url] = frame_url
                except Exception:
                    pass  # Non-critical — request introspection may fail

            page.on("request", _on_request)

            # Track Set-Cookie headers from responses
            async def _on_response(response: Response) -> None:
                try:
                    headers = await response.all_headers()
                    set_cookie = headers.get("set-cookie", "")
                    if set_cookie:
                        # Attribute cookie to the initiating script
                        request: Request = response.request
                        initiator = _get_script_initiator(request)
                        # Build the initiator chain for this request
                        chain = _build_initiator_chain(request.url, initiator_map)
                        for cookie_str in set_cookie.split("\n"):
                            name = cookie_str.split("=")[0].strip()
                            if name:
                                if initiator:
                                    script_cookies[name] = initiator
                                initiator_chains[name] = chain
                except Exception:
                    pass  # Non-critical — response may have been aborted

            page.on("response", _on_response)

            # Navigate
            await page.goto(url, wait_until="domcontentloaded", timeout=self._timeout_ms)
            # Allow additional time for scripts to set cookies after DOM load.
            await page.wait_for_timeout(3000)

            # Enumerate browser cookies via CDP
            cdp_cookies = await context.cookies()
            for c in cdp_cookies:
                result.cookies.append(
                    DiscoveredCookie(
                        name=c["name"],
                        domain=c["domain"],
                        storage_type="cookie",
                        path=c.get("path"),
                        expires=c.get("expires"),
                        http_only=c.get("httpOnly"),
                        secure=c.get("secure"),
                        same_site=c.get("sameSite"),
                        value_length=len(c.get("value", "")),
                        script_source=script_cookies.get(c["name"]),
                        page_url=url,
                        initiator_chain=initiator_chains.get(c["name"], []),
                    )
                )

            # Enumerate localStorage
            ls_items = await page.evaluate("""() => {
                const items = [];
                try {
                    for (let i = 0; i < localStorage.length; i++) {
                        const key = localStorage.key(i);
                        if (key) {
                            items.push({
                                name: key,
                                valueLength: (localStorage.getItem(key) || '').length,
                            });
                        }
                    }
                } catch (e) {}
                return items;
            }""")
            hostname = urlparse(url).hostname or ""
            for item in ls_items:
                result.cookies.append(
                    DiscoveredCookie(
                        name=item["name"],
                        domain=hostname,
                        storage_type="local_storage",
                        value_length=item["valueLength"],
                        page_url=url,
                    )
                )

            # Enumerate sessionStorage
            ss_items = await page.evaluate("""() => {
                const items = [];
                try {
                    for (let i = 0; i < sessionStorage.length; i++) {
                        const key = sessionStorage.key(i);
                        if (key) {
                            items.push({
                                name: key,
                                valueLength: (sessionStorage.getItem(key) || '').length,
                            });
                        }
                    }
                } catch (e) {}
                return items;
            }""")
            for item in ss_items:
                result.cookies.append(
                    DiscoveredCookie(
                        name=item["name"],
                        domain=hostname,
                        storage_type="session_storage",
                        value_length=item["valueLength"],
                        page_url=url,
                    )
                )

        except Exception as exc:
            result.error = str(exc)
            logger.warning("Failed to crawl %s: %s", url, exc)
        finally:
            if context:
                await context.close()

        return result


def _get_script_initiator(request: Request) -> str | None:
    """Walk the request chain to find the originating script URL.

    Returns a single script URL for backwards compatibility. For the full
    initiator path, use :func:`_build_initiator_chain` instead.
    """
    seen: set[str] = set()
    current = request
    while current:
        url = current.url
        if url in seen:
            break
        seen.add(url)
        if url.endswith(".js") or "javascript" in (current.resource_type or ""):
            return url
        redirected = current.redirected_from
        if redirected:
            current = redirected
        else:
            break
    return None


def _build_initiator_chain(
    url: str,
    initiator_map: dict[str, str],
    max_depth: int = 20,
) -> list[str]:
    """Build the full initiator chain from a URL back to the root.

    Walks the initiator map from *url* towards the top-level page,
    producing a list ordered root-first (i.e. the page URL at index 0
    and the leaf request URL at the end).
    """
    chain = [url]
    seen: set[str] = {url}
    current = url
    for _ in range(max_depth):
        parent = initiator_map.get(current, "")
        if not parent or parent in seen:
            break
        chain.append(parent)
        seen.add(parent)
        current = parent
    chain.reverse()  # Root first
    return chain
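A minimal driver for the crawler above. Illustrative only, not part of the committed file; it assumes Playwright's Chromium is installed (for example via `playwright install chromium`) and that the scanner's `src` package is importable, and the target URLs are placeholders.

import asyncio

from src.crawler import CookieCrawler

async def main() -> None:
    crawler = CookieCrawler(headless=True, timeout_ms=30_000)
    result = await crawler.crawl_site(
        ["https://example.com/", "https://example.com/pricing"], max_pages=2
    )
    # unique_cookies deduplicates by (name, domain, storage_type) across pages
    for cookie in result.unique_cookies:
        print(cookie.storage_type, cookie.name, cookie.domain, cookie.script_source)

asyncio.run(main())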
apps/scanner/src/dark_pattern_detector.py (new file, 348 lines)
@@ -0,0 +1,348 @@
"""Dark pattern detection — CSS and DOM analysis of consent banners.

Detects manipulative UI patterns in cookie consent banners:
- Unequal button prominence (Accept bigger/brighter than Reject)
- Pre-ticked category checkboxes
- Missing first-layer Reject button (CNIL violation)
- Cookie walls (blocking page content)
- Dismiss-on-scroll (not valid consent under GDPR)
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field

from playwright.async_api import Page

logger = logging.getLogger(__name__)


@dataclass
class DarkPatternIssue:
    """A detected dark pattern in the consent banner."""

    pattern: str
    severity: str  # critical, warning, info
    message: str
    recommendation: str
    details: dict = field(default_factory=dict)


@dataclass
class DarkPatternResult:
    """Result of dark pattern analysis."""

    url: str
    issues: list[DarkPatternIssue] = field(default_factory=list)
    banner_found: bool = False
    error: str | None = None


# Common selectors for consent banner elements
BANNER_SELECTORS = [
    "[id*='cookie']",
    "[id*='consent']",
    "[class*='cookie']",
    "[class*='consent']",
    "[id*='cmp']",
    "[class*='cmp']",
    "[role='dialog'][aria-label*='cookie' i]",
    "[role='dialog'][aria-label*='consent' i]",
]

ACCEPT_BUTTON_SELECTORS = [
    "button:has-text('Accept')",
    "button:has-text('Accept All')",
    "button:has-text('Allow')",
    "button:has-text('Allow All')",
    "button:has-text('I Agree')",
    "button:has-text('OK')",
    "button:has-text('Got it')",
    "[data-action='accept']",
    "[id*='accept']",
]

REJECT_BUTTON_SELECTORS = [
    "button:has-text('Reject')",
    "button:has-text('Reject All')",
    "button:has-text('Decline')",
    "button:has-text('Deny')",
    "button:has-text('Refuse')",
    "button:has-text('Tout refuser')",
    "[data-action='reject']",
    "[id*='reject']",
]


async def _find_banner(page: Page) -> bool:
    """Check if a consent banner is visible on the page."""
    for selector in BANNER_SELECTORS:
        try:
            elements = await page.query_selector_all(selector)
            for el in elements:
                if await el.is_visible():
                    return True
        except Exception:
            continue
    return False


async def _find_button(page: Page, selectors: list[str]) -> dict | None:
    """Find a visible button matching one of the selectors, return its computed styles."""
    for selector in selectors:
        try:
            elements = await page.query_selector_all(selector)
            for el in elements:
                if await el.is_visible():
                    styles = await el.evaluate("""(el) => {
                        const cs = window.getComputedStyle(el);
                        const rect = el.getBoundingClientRect();
                        return {
                            width: rect.width,
                            height: rect.height,
                            area: rect.width * rect.height,
                            backgroundColor: cs.backgroundColor,
                            color: cs.color,
                            fontSize: parseFloat(cs.fontSize),
                            fontWeight: cs.fontWeight,
                            padding: cs.padding,
                            text: el.textContent.trim(),
                            visible: true,
                        };
                    }""")
                    return styles
        except Exception:
            continue
    return None


async def check_button_prominence(page: Page) -> list[DarkPatternIssue]:
    """Compare Accept and Reject button sizes and visual weight."""
    issues: list[DarkPatternIssue] = []

    accept_btn = await _find_button(page, ACCEPT_BUTTON_SELECTORS)
    reject_btn = await _find_button(page, REJECT_BUTTON_SELECTORS)

    if not accept_btn:
        return issues  # No accept button found — nothing to compare

    if not reject_btn:
        issues.append(
            DarkPatternIssue(
                pattern="missing_reject_button",
                severity="critical",
                message="No visible Reject/Decline button found on the first layer.",
                recommendation=(
                    "Add a clearly visible 'Reject All' button on the first layer "
                    "of the consent banner, as required by GDPR and CNIL."
                ),
            )
        )
        return issues

    # Compare button areas
    accept_area = accept_btn.get("area", 0)
    reject_area = reject_btn.get("area", 0)

    if reject_area > 0 and accept_area > 0:
        ratio = accept_area / reject_area
        if ratio > 1.5:
            issues.append(
                DarkPatternIssue(
                    pattern="unequal_button_size",
                    severity="warning",
                    message=(
                        f"Accept button is {ratio:.1f}x larger than Reject button. "
                        "Buttons should have equal prominence."
                    ),
                    recommendation=(
                        "Make the Accept and Reject buttons the same size and visual weight."
                    ),
                    details={
                        "accept_area": accept_area,
                        "reject_area": reject_area,
                        "ratio": round(ratio, 2),
                    },
                )
            )

    # Compare font sizes
    accept_font = accept_btn.get("fontSize", 0)
    reject_font = reject_btn.get("fontSize", 0)

    if reject_font > 0 and accept_font > reject_font * 1.3:
        issues.append(
            DarkPatternIssue(
                pattern="unequal_font_size",
                severity="warning",
                message=(
                    f"Accept button font ({accept_font}px) is larger than "
                    f"Reject button font ({reject_font}px)."
                ),
                recommendation="Use the same font size for both Accept and Reject buttons.",
                details={
                    "accept_font_size": accept_font,
                    "reject_font_size": reject_font,
                },
            )
        )

    return issues


async def check_pre_ticked_boxes(page: Page) -> list[DarkPatternIssue]:
    """Check for pre-ticked non-essential category checkboxes."""
    issues: list[DarkPatternIssue] = []

    try:
        pre_ticked = await page.evaluate("""() => {
            const checkboxes = document.querySelectorAll(
                'input[type="checkbox"][checked], input[type="checkbox"]:checked'
            );
            const results = [];
            for (const cb of checkboxes) {
                // Skip if it looks like an "essential" checkbox (often disabled)
                if (cb.disabled) continue;
                const label = cb.closest('label')?.textContent?.trim()
                    || cb.getAttribute('aria-label')
                    || cb.name
                    || 'unknown';
                // Skip checkboxes that appear to be for essential/necessary
                const labelLower = label.toLowerCase();
                if (labelLower.includes('essential') || labelLower.includes('necessary')
                    || labelLower.includes('required') || labelLower.includes('strictly')) {
                    continue;
                }
                results.push({ name: cb.name || cb.id, label: label });
            }
            return results;
        }""")

        if pre_ticked:
            labels = [pt["label"][:50] for pt in pre_ticked]
            issues.append(
                DarkPatternIssue(
                    pattern="pre_ticked_checkboxes",
                    severity="critical",
                    message=(
                        f"{len(pre_ticked)} non-essential category checkbox(es) are pre-ticked: "
                        f"{', '.join(labels[:3])}"
                    ),
                    recommendation=(
                        "Non-essential category checkboxes must default to unchecked. "
                        "Pre-ticked boxes do not constitute valid consent under GDPR."
                    ),
                    details={"checkboxes": pre_ticked},
                )
            )
    except Exception as exc:
        logger.debug("Pre-ticked checkbox check failed: %s", exc)

    return issues


async def check_cookie_wall(page: Page) -> list[DarkPatternIssue]:
    """Check if a cookie wall blocks access to page content."""
    issues: list[DarkPatternIssue] = []

    try:
        is_wall = await page.evaluate("""() => {
            // Check for full-screen overlays blocking content
            const overlays = document.querySelectorAll(
                '[class*="overlay"], [class*="modal"], [class*="wall"]'
            );
            for (const overlay of overlays) {
                const cs = window.getComputedStyle(overlay);
                const rect = overlay.getBoundingClientRect();
                // Full-viewport overlay with high z-index suggests a cookie wall
                if (rect.width >= window.innerWidth * 0.9
                    && rect.height >= window.innerHeight * 0.9
                    && parseInt(cs.zIndex) > 100) {
                    return true;
                }
            }
            // Check if body/main is hidden or has overflow hidden
            const body = document.body;
            const bodyStyle = window.getComputedStyle(body);
            if (bodyStyle.overflow === 'hidden' && bodyStyle.position === 'fixed') {
                return true;
            }
            return false;
        }""")

        if is_wall:
            issues.append(
                DarkPatternIssue(
                    pattern="cookie_wall",
                    severity="critical",
                    message="Cookie wall detected — page content appears blocked until consent.",
                    recommendation=(
                        "Remove the cookie wall. Users must be able to access the site "
                        "without being forced to consent to non-essential cookies."
                    ),
                )
            )
    except Exception as exc:
        logger.debug("Cookie wall check failed: %s", exc)

    return issues


async def check_scroll_dismissal(page: Page) -> list[DarkPatternIssue]:
    """Check if scrolling dismisses the consent banner (not valid consent)."""
    issues: list[DarkPatternIssue] = []

    try:
        # Check if banner is visible before scroll
        banner_visible_before = await _find_banner(page)
        if not banner_visible_before:
            return issues

        # Scroll down
        await page.evaluate("window.scrollBy(0, 500)")
        await page.wait_for_timeout(1000)

        # Check if banner disappeared
        banner_visible_after = await _find_banner(page)

        if banner_visible_before and not banner_visible_after:
            issues.append(
                DarkPatternIssue(
                    pattern="scroll_dismissal",
                    severity="critical",
                    message="Consent banner dismissed on scroll — this is not valid consent.",
                    recommendation=(
                        "Disable dismiss-on-scroll. Under GDPR, scrolling does not "
                        "constitute valid consent. The banner must remain until the user "
                        "makes an explicit choice."
                    ),
                )
            )
    except Exception as exc:
        logger.debug("Scroll dismissal check failed: %s", exc)

    return issues


async def detect_dark_patterns(page: Page) -> DarkPatternResult:
    """Run all dark pattern checks on the current page."""
    url = page.url
    result = DarkPatternResult(url=url)

    try:
        result.banner_found = await _find_banner(page)
        if not result.banner_found:
            return result

        # Run all checks
        result.issues.extend(await check_button_prominence(page))
        result.issues.extend(await check_pre_ticked_boxes(page))
        result.issues.extend(await check_cookie_wall(page))
        result.issues.extend(await check_scroll_dismissal(page))

    except Exception as exc:
        result.error = str(exc)
        logger.warning("Dark pattern detection failed for %s: %s", url, exc)

    return result
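A standalone driver for the detector above. Illustrative only, not part of the committed file; it assumes Playwright's Chromium is installed and uses a placeholder URL.

import asyncio

from playwright.async_api import async_playwright

from src.dark_pattern_detector import detect_dark_patterns

async def main() -> None:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://example.com/", wait_until="domcontentloaded")
        report = await detect_dark_patterns(page)
        # Each issue carries a pattern id, severity, message, and recommendation.
        for issue in report.issues:
            print(f"[{issue.severity}] {issue.pattern}: {issue.message}")
        await browser.close()

asyncio.run(main())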
apps/scanner/src/sitemap.py (new file, 119 lines)
@@ -0,0 +1,119 @@
"""Sitemap parser for URL discovery.

Fetches and parses XML sitemaps (including sitemap indexes) to discover
URLs for crawling. Falls back to common page paths if no sitemap exists.
"""

from __future__ import annotations

import logging
from xml.etree import ElementTree

import httpx

logger = logging.getLogger(__name__)

# XML namespace used in sitemaps
_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}

# Common page paths to try when no sitemap is available
_DEFAULT_PATHS = [
    "/",
    "/about",
    "/contact",
    "/privacy",
    "/privacy-policy",
    "/terms",
    "/cookie-policy",
]


async def discover_urls(
    domain: str,
    *,
    max_urls: int = 50,
    timeout: float = 10.0,
) -> list[str]:
    """Discover URLs for a domain via sitemap or fallback paths.

    Attempts to fetch /sitemap.xml first. If that fails, tries
    /robots.txt for a Sitemap directive. Falls back to default paths.
    """
    base = f"https://{domain}"
    urls: list[str] = []

    async with httpx.AsyncClient(
        timeout=timeout,
        follow_redirects=True,
        verify=False,  # noqa: S501 — scanning may target sites with self-signed certs
    ) as client:
        # Try sitemap.xml
        sitemap_urls = await _fetch_sitemap(client, f"{base}/sitemap.xml", max_urls)
        if sitemap_urls:
            return sitemap_urls[:max_urls]

        # Try robots.txt for Sitemap directive
        sitemap_url = await _find_sitemap_in_robots(client, f"{base}/robots.txt")
        if sitemap_url:
            sitemap_urls = await _fetch_sitemap(client, sitemap_url, max_urls)
            if sitemap_urls:
                return sitemap_urls[:max_urls]

        # Fallback to default paths
        urls = [f"{base}{path}" for path in _DEFAULT_PATHS]
        return urls[:max_urls]


async def _fetch_sitemap(
    client: httpx.AsyncClient,
    url: str,
    max_urls: int,
) -> list[str]:
    """Fetch and parse an XML sitemap. Handles sitemap indexes."""
    try:
        resp = await client.get(url)
        if resp.status_code != 200:
            return []

        root = ElementTree.fromstring(resp.text)

        # Check if it's a sitemap index
        sitemaps = root.findall("sm:sitemap/sm:loc", _NS)
        if sitemaps:
            urls: list[str] = []
            for sm_loc in sitemaps:
                if sm_loc.text:
                    child_urls = await _fetch_sitemap(client, sm_loc.text, max_urls - len(urls))
                    urls.extend(child_urls)
                if len(urls) >= max_urls:
                    break
            return urls[:max_urls]

        # Regular sitemap — extract <loc> URLs
        locs = root.findall("sm:url/sm:loc", _NS)
        return [loc.text for loc in locs if loc.text][:max_urls]

    except Exception as exc:
        logger.debug("Failed to fetch sitemap %s: %s", url, exc)
        return []


async def _find_sitemap_in_robots(
    client: httpx.AsyncClient,
    robots_url: str,
) -> str | None:
    """Look for a Sitemap directive in robots.txt."""
    try:
        resp = await client.get(robots_url)
        if resp.status_code != 200:
            return None

        for line in resp.text.splitlines():
            stripped = line.strip()
            if stripped.lower().startswith("sitemap:"):
                return stripped.split(":", 1)[1].strip()

    except Exception:
        pass

    return None
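A one-liner exercising the discovery helper above. Illustrative only, not part of the committed file; the domain is a placeholder.

import asyncio

from src.sitemap import discover_urls

urls = asyncio.run(discover_urls("example.com", max_urls=10))
print(urls)  # sitemap URLs if one was found, otherwise the default fallback paths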
apps/scanner/src/worker.py (new file, 379 lines)
@@ -0,0 +1,379 @@
"""Scanner HTTP service.

Exposes an HTTP endpoint that accepts scan requests, runs the Playwright
cookie crawler, and returns discovered cookies. Called by the API's Celery
worker to execute scan jobs.
"""

from __future__ import annotations

import logging

from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict

logger = logging.getLogger(__name__)


# ── Settings ─────────────────────────────────────────────────────────


class ScannerSettings(BaseSettings):
    """Scanner service settings from environment."""

    model_config = SettingsConfigDict(env_file=".env", case_sensitive=False)

    host: str = "0.0.0.0"
    port: int = 8001
    log_level: str = "INFO"
    crawler_timeout_ms: int = 30_000
    crawler_headless: bool = True
    max_pages_per_scan: int = 50


# ── Request / Response schemas ───────────────────────────────────────


class ProxyRequest(BaseModel):
    """Proxy configuration for geo-located scanning."""

    server: str
    username: str | None = None
    password: str | None = None


class ScanRequest(BaseModel):
    """Incoming scan request from the API worker."""

    domain: str
    urls: list[str] = Field(default_factory=list)
    max_pages: int = 50
    proxy: ProxyRequest | None = None


class DiscoveredCookieResponse(BaseModel):
    """A single cookie found during crawling."""

    name: str
    domain: str
    storage_type: str = "cookie"
    path: str | None = None
    expires: float | None = None
    http_only: bool | None = None
    secure: bool | None = None
    same_site: str | None = None
    value_length: int = 0
    script_source: str | None = None
    page_url: str = ""
    initiator_chain: list[str] = Field(default_factory=list)


class ScanResponse(BaseModel):
    """Result of a scan."""

    domain: str
    pages_crawled: int
    total_cookies: int
    cookies: list[DiscoveredCookieResponse]
    errors: list[str] = Field(default_factory=list)


class ValidationRequest(BaseModel):
    """Request for consent validation and dark pattern detection."""

    url: str
    essential_cookie_names: list[str] = Field(default_factory=list)
    proxy: ProxyRequest | None = None


class ValidationIssueResponse(BaseModel):
    """A single validation issue."""

    check: str
    severity: str
    message: str
    recommendation: str
    details: dict = Field(default_factory=dict)


class DarkPatternIssueResponse(BaseModel):
    """A detected dark pattern."""

    pattern: str
    severity: str
    message: str
    recommendation: str
    details: dict = Field(default_factory=dict)


class ValidationResponse(BaseModel):
    """Result of consent validation and dark pattern detection."""

    url: str
    pre_consent_issues: list[ValidationIssueResponse] = Field(default_factory=list)
    post_accept_issues: list[ValidationIssueResponse] = Field(default_factory=list)
    post_reject_issues: list[ValidationIssueResponse] = Field(default_factory=list)
    dark_pattern_issues: list[DarkPatternIssueResponse] = Field(default_factory=list)
    banner_found: bool = False
    errors: list[str] = Field(default_factory=list)


# ── Application ──────────────────────────────────────────────────────


def create_app():  # noqa: ANN201
    """Create the scanner FastAPI application."""
    from fastapi import FastAPI, HTTPException

    from src.crawler import CookieCrawler
    from src.sitemap import discover_urls

    app = FastAPI(title="CMP Scanner Service", version="0.1.0")
    settings = ScannerSettings()

    @app.get("/health")
    async def health() -> dict[str, str]:
        return {"status": "ok"}

    @app.post("/scan", response_model=ScanResponse)
    async def run_scan(body: ScanRequest) -> ScanResponse:
        """Execute a Playwright crawl and return discovered cookies."""
        # Discover URLs if none provided
        urls = body.urls
        if not urls:
            try:
                urls = await discover_urls(
                    body.domain, max_urls=min(body.max_pages, settings.max_pages_per_scan)
                )
            except Exception as exc:
                logger.warning("URL discovery failed for %s: %s", body.domain, exc)
                urls = [f"https://{body.domain}/"]

        if not urls:
            raise HTTPException(status_code=400, detail="No URLs to scan")

        # Run crawler
        from src.crawler import ProxyConfig

        proxy_config = None
        if body.proxy:
            proxy_config = ProxyConfig(
                server=body.proxy.server,
                username=body.proxy.username,
                password=body.proxy.password,
            )

        crawler = CookieCrawler(
            headless=settings.crawler_headless,
            timeout_ms=settings.crawler_timeout_ms,
            proxy=proxy_config,
        )
        result = await crawler.crawl_site(
            urls, max_pages=min(body.max_pages, settings.max_pages_per_scan)
        )

        # Build response
        cookies = [
            DiscoveredCookieResponse(
                name=c.name,
                domain=c.domain,
                storage_type=c.storage_type,
                path=c.path,
                expires=c.expires,
                http_only=c.http_only,
                secure=c.secure,
                same_site=c.same_site,
                value_length=c.value_length,
                script_source=c.script_source,
                page_url=c.page_url,
                initiator_chain=c.initiator_chain,
            )
            for c in result.unique_cookies
        ]

        errors = [p.error for p in result.pages if p.error]

        return ScanResponse(
            domain=result.domain,
            pages_crawled=len(result.pages),
            total_cookies=result.total_cookies_found,
            cookies=cookies,
            errors=errors,
        )

    @app.post("/validate", response_model=ValidationResponse)
    async def run_validation(body: ValidationRequest) -> ValidationResponse:
        """Run consent signal validation and dark pattern detection."""
        from playwright.async_api import async_playwright

        from src.consent_validator import (
            _is_tracker_request,
            validate_post_accept,
            validate_post_reject,
            validate_pre_consent,
        )
        from src.crawler import ProxyConfig
        from src.dark_pattern_detector import detect_dark_patterns

        response = ValidationResponse(url=body.url)
        essential_names = set(body.essential_cookie_names)
        tracker_requests: list[str] = []

        proxy_config = None
        if body.proxy:
            proxy_config = ProxyConfig(
                server=body.proxy.server,
                username=body.proxy.username,
                password=body.proxy.password,
            )

        try:
            async with async_playwright() as pw:
                launch_kwargs: dict = {"headless": settings.crawler_headless}
                if proxy_config:
                    proxy_opts: dict = {"server": proxy_config.server}
                    if proxy_config.username:
                        proxy_opts["username"] = proxy_config.username
                    if proxy_config.password:
                        proxy_opts["password"] = proxy_config.password
                    launch_kwargs["proxy"] = proxy_opts

                browser = await pw.chromium.launch(**launch_kwargs)
                try:
                    context = await browser.new_context(ignore_https_errors=True)
                    page = await context.new_page()

                    # Track network requests for tracker detection
                    def _on_request(request) -> None:
                        if _is_tracker_request(request.url):
                            tracker_requests.append(request.url)

                    page.on("request", _on_request)

                    # ── Pre-consent check ────────────────────────
                    await page.goto(
                        body.url,
                        wait_until="networkidle",
                        timeout=settings.crawler_timeout_ms,
                    )

                    pre_issues = await validate_pre_consent(
                        page, context, essential_names, tracker_requests
                    )
                    response.pre_consent_issues = [
                        ValidationIssueResponse(**vars(i)) for i in pre_issues
                    ]

                    # ── Dark pattern detection ───────────────────
                    dp_result = await detect_dark_patterns(page)
                    response.banner_found = dp_result.banner_found
                    response.dark_pattern_issues = [
                        DarkPatternIssueResponse(**vars(i)) for i in dp_result.issues
                    ]

                    # ── Post-accept check ────────────────────────
                    # Try to click Accept All
                    accept_selectors = [
                        "button:has-text('Accept All')",
                        "button:has-text('Accept')",
                        "button:has-text('Allow All')",
                        "button:has-text('I Agree')",
                        "[data-action='accept']",
                    ]
                    accepted = False
                    for selector in accept_selectors:
                        try:
                            btn = page.locator(selector).first
                            if await btn.is_visible(timeout=1000):
                                await btn.click()
                                await page.wait_for_timeout(2000)
                                accepted = True
                                break
                        except Exception:
                            continue

                    if accepted:
                        tracker_requests.clear()
                        post_accept = await validate_post_accept(page, context)
                        response.post_accept_issues = [
                            ValidationIssueResponse(**vars(i)) for i in post_accept
                        ]

                    # ── Post-reject check ────────────────────────
                    # Reload and reject
                    await context.clear_cookies()
                    tracker_requests.clear()
                    await page.goto(
                        body.url,
                        wait_until="networkidle",
                        timeout=settings.crawler_timeout_ms,
                    )

                    reject_selectors = [
                        "button:has-text('Reject All')",
                        "button:has-text('Reject')",
                        "button:has-text('Decline')",
                        "button:has-text('Deny')",
                        "[data-action='reject']",
                    ]
                    rejected = False
                    for selector in reject_selectors:
                        try:
                            btn = page.locator(selector).first
                            if await btn.is_visible(timeout=1000):
                                await btn.click()
                                await page.wait_for_timeout(2000)
                                rejected = True
                                break
                        except Exception:
                            continue

                    if rejected:
                        post_reject_trackers: list[str] = []
                        # Collect any new tracker requests after rejection
                        for req_url in tracker_requests:
                            if _is_tracker_request(req_url):
                                post_reject_trackers.append(req_url)

                        post_reject = await validate_post_reject(
                            page, context, essential_names, post_reject_trackers
                        )
                        response.post_reject_issues = [
                            ValidationIssueResponse(**vars(i)) for i in post_reject
                        ]

                    await context.close()
                finally:
                    await browser.close()

        except Exception as exc:
            response.errors.append(str(exc))
            logger.warning("Validation failed for %s: %s", body.url, exc)

        return response

    return app


# ── Entrypoint ───────────────────────────────────────────────────────


def main() -> None:
    """Run the scanner service with uvicorn."""
    import uvicorn

    settings = ScannerSettings()
    logging.basicConfig(level=settings.log_level)

    uvicorn.run(
        "src.worker:create_app",
        factory=True,
        host=settings.host,
        port=settings.port,
        workers=1,  # Single worker — Playwright manages its own concurrency
        access_log=True,
    )


if __name__ == "__main__":
    main()
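An example request against the service above once it is running on its default port 8001. Illustrative only, not part of the committed file; the target domain is a placeholder and the timeout is an assumption for a multi-page crawl.

import httpx

resp = httpx.post(
    "http://localhost:8001/scan",
    json={"domain": "example.com", "max_pages": 5},  # matches the ScanRequest schema
    timeout=300.0,
)
resp.raise_for_status()
scan = resp.json()
print(scan["pages_crawled"], scan["total_cookies"])
for cookie in scan["cookies"][:10]:
    print(cookie["name"], cookie["domain"], cookie["storage_type"])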