Files
consentos/apps/scanner/src/worker.py
James Cottrill fbf26453f2 feat: initial public release
ConsentOS — a privacy-first cookie consent management platform.

Self-hosted, source-available alternative to OneTrust, Cookiebot, and
CookieYes. Full standards coverage (IAB TCF v2.2, GPP v1, Google
Consent Mode v2, GPC, Shopify Customer Privacy API), multi-tenant
architecture with role-based access, configuration cascade
(system → org → group → site → region), dark-pattern detection in
the scanner, and a tamper-evident consent record audit trail.

This is the initial public release. Prior development history is
retained internally.

See README.md for the feature list, architecture overview, and
quick-start instructions. Licensed under the Elastic Licence 2.0 —
self-host freely; do not resell as a managed service.
2026-04-14 09:18:18 +00:00

380 lines
13 KiB
Python

"""Scanner HTTP service.
Exposes an HTTP endpoint that accepts scan requests, runs the Playwright
cookie crawler, and returns discovered cookies. Called by the API's Celery
worker to execute scan jobs.
"""
from __future__ import annotations
import logging
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
logger = logging.getLogger(__name__)
# ── Settings ─────────────────────────────────────────────────────────
class ScannerSettings(BaseSettings):
"""Scanner service settings from environment."""
model_config = SettingsConfigDict(env_file=".env", case_sensitive=False)
host: str = "0.0.0.0"
port: int = 8001
log_level: str = "INFO"
crawler_timeout_ms: int = 30_000
crawler_headless: bool = True
max_pages_per_scan: int = 50
# ── Request / Response schemas ───────────────────────────────────────
class ProxyRequest(BaseModel):
"""Proxy configuration for geo-located scanning."""
server: str
username: str | None = None
password: str | None = None
class ScanRequest(BaseModel):
"""Incoming scan request from the API worker."""
domain: str
urls: list[str] = Field(default_factory=list)
max_pages: int = 50
proxy: ProxyRequest | None = None
class DiscoveredCookieResponse(BaseModel):
"""A single cookie found during crawling."""
name: str
domain: str
storage_type: str = "cookie"
path: str | None = None
expires: float | None = None
http_only: bool | None = None
secure: bool | None = None
same_site: str | None = None
value_length: int = 0
script_source: str | None = None
page_url: str = ""
initiator_chain: list[str] = Field(default_factory=list)
class ScanResponse(BaseModel):
"""Result of a scan."""
domain: str
pages_crawled: int
total_cookies: int
cookies: list[DiscoveredCookieResponse]
errors: list[str] = Field(default_factory=list)
class ValidationRequest(BaseModel):
"""Request for consent validation and dark pattern detection."""
url: str
essential_cookie_names: list[str] = Field(default_factory=list)
proxy: ProxyRequest | None = None
class ValidationIssueResponse(BaseModel):
"""A single validation issue."""
check: str
severity: str
message: str
recommendation: str
details: dict = Field(default_factory=dict)
class DarkPatternIssueResponse(BaseModel):
"""A detected dark pattern."""
pattern: str
severity: str
message: str
recommendation: str
details: dict = Field(default_factory=dict)
class ValidationResponse(BaseModel):
"""Result of consent validation and dark pattern detection."""
url: str
pre_consent_issues: list[ValidationIssueResponse] = Field(default_factory=list)
post_accept_issues: list[ValidationIssueResponse] = Field(default_factory=list)
post_reject_issues: list[ValidationIssueResponse] = Field(default_factory=list)
dark_pattern_issues: list[DarkPatternIssueResponse] = Field(default_factory=list)
banner_found: bool = False
errors: list[str] = Field(default_factory=list)
# ── Application ──────────────────────────────────────────────────────
def create_app(): # noqa: ANN201
"""Create the scanner FastAPI application."""
from fastapi import FastAPI, HTTPException
from src.crawler import CookieCrawler
from src.sitemap import discover_urls
app = FastAPI(title="CMP Scanner Service", version="0.1.0")
settings = ScannerSettings()
@app.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok"}
@app.post("/scan", response_model=ScanResponse)
async def run_scan(body: ScanRequest) -> ScanResponse:
"""Execute a Playwright crawl and return discovered cookies."""
# Discover URLs if none provided
urls = body.urls
if not urls:
try:
urls = await discover_urls(
body.domain, max_urls=min(body.max_pages, settings.max_pages_per_scan)
)
except Exception as exc:
logger.warning("URL discovery failed for %s: %s", body.domain, exc)
urls = [f"https://{body.domain}/"]
if not urls:
raise HTTPException(status_code=400, detail="No URLs to scan")
# Run crawler
from src.crawler import ProxyConfig
proxy_config = None
if body.proxy:
proxy_config = ProxyConfig(
server=body.proxy.server,
username=body.proxy.username,
password=body.proxy.password,
)
crawler = CookieCrawler(
headless=settings.crawler_headless,
timeout_ms=settings.crawler_timeout_ms,
proxy=proxy_config,
)
result = await crawler.crawl_site(
urls, max_pages=min(body.max_pages, settings.max_pages_per_scan)
)
# Build response
cookies = [
DiscoveredCookieResponse(
name=c.name,
domain=c.domain,
storage_type=c.storage_type,
path=c.path,
expires=c.expires,
http_only=c.http_only,
secure=c.secure,
same_site=c.same_site,
value_length=c.value_length,
script_source=c.script_source,
page_url=c.page_url,
initiator_chain=c.initiator_chain,
)
for c in result.unique_cookies
]
errors = [p.error for p in result.pages if p.error]
return ScanResponse(
domain=result.domain,
pages_crawled=len(result.pages),
total_cookies=result.total_cookies_found,
cookies=cookies,
errors=errors,
)
@app.post("/validate", response_model=ValidationResponse)
async def run_validation(body: ValidationRequest) -> ValidationResponse:
"""Run consent signal validation and dark pattern detection."""
from playwright.async_api import async_playwright
from src.consent_validator import (
_is_tracker_request,
validate_post_accept,
validate_post_reject,
validate_pre_consent,
)
from src.crawler import ProxyConfig
from src.dark_pattern_detector import detect_dark_patterns
response = ValidationResponse(url=body.url)
essential_names = set(body.essential_cookie_names)
tracker_requests: list[str] = []
proxy_config = None
if body.proxy:
proxy_config = ProxyConfig(
server=body.proxy.server,
username=body.proxy.username,
password=body.proxy.password,
)
try:
async with async_playwright() as pw:
launch_kwargs: dict = {"headless": settings.crawler_headless}
if proxy_config:
proxy_opts: dict = {"server": proxy_config.server}
if proxy_config.username:
proxy_opts["username"] = proxy_config.username
if proxy_config.password:
proxy_opts["password"] = proxy_config.password
launch_kwargs["proxy"] = proxy_opts
browser = await pw.chromium.launch(**launch_kwargs)
try:
context = await browser.new_context(ignore_https_errors=True)
page = await context.new_page()
# Track network requests for tracker detection
def _on_request(request) -> None:
if _is_tracker_request(request.url):
tracker_requests.append(request.url)
page.on("request", _on_request)
# ── Pre-consent check ────────────────────────
await page.goto(
body.url,
wait_until="networkidle",
timeout=settings.crawler_timeout_ms,
)
pre_issues = await validate_pre_consent(
page, context, essential_names, tracker_requests
)
response.pre_consent_issues = [
ValidationIssueResponse(**vars(i)) for i in pre_issues
]
# ── Dark pattern detection ───────────────────
dp_result = await detect_dark_patterns(page)
response.banner_found = dp_result.banner_found
response.dark_pattern_issues = [
DarkPatternIssueResponse(**vars(i)) for i in dp_result.issues
]
# ── Post-accept check ────────────────────────
# Try to click Accept All
accept_selectors = [
"button:has-text('Accept All')",
"button:has-text('Accept')",
"button:has-text('Allow All')",
"button:has-text('I Agree')",
"[data-action='accept']",
]
accepted = False
for selector in accept_selectors:
try:
btn = page.locator(selector).first
if await btn.is_visible(timeout=1000):
await btn.click()
await page.wait_for_timeout(2000)
accepted = True
break
except Exception:
continue
if accepted:
tracker_requests.clear()
post_accept = await validate_post_accept(page, context)
response.post_accept_issues = [
ValidationIssueResponse(**vars(i)) for i in post_accept
]
# ── Post-reject check ────────────────────────
# Reload and reject
await context.clear_cookies()
tracker_requests.clear()
await page.goto(
body.url,
wait_until="networkidle",
timeout=settings.crawler_timeout_ms,
)
reject_selectors = [
"button:has-text('Reject All')",
"button:has-text('Reject')",
"button:has-text('Decline')",
"button:has-text('Deny')",
"[data-action='reject']",
]
rejected = False
for selector in reject_selectors:
try:
btn = page.locator(selector).first
if await btn.is_visible(timeout=1000):
await btn.click()
await page.wait_for_timeout(2000)
rejected = True
break
except Exception:
continue
if rejected:
post_reject_trackers: list[str] = []
# Collect any new tracker requests after rejection
for req_url in tracker_requests:
if _is_tracker_request(req_url):
post_reject_trackers.append(req_url)
post_reject = await validate_post_reject(
page, context, essential_names, post_reject_trackers
)
response.post_reject_issues = [
ValidationIssueResponse(**vars(i)) for i in post_reject
]
await context.close()
finally:
await browser.close()
except Exception as exc:
response.errors.append(str(exc))
logger.warning("Validation failed for %s: %s", body.url, exc)
return response
return app
# ── Entrypoint ───────────────────────────────────────────────────────
def main() -> None:
"""Run the scanner service with uvicorn."""
import uvicorn
settings = ScannerSettings()
logging.basicConfig(level=settings.log_level)
uvicorn.run(
"src.worker:create_app",
factory=True,
host=settings.host,
port=settings.port,
workers=1, # Single worker — Playwright manages its own concurrency
access_log=True,
)
if __name__ == "__main__":
main()