feat: initial public release

ConsentOS — a privacy-first cookie consent management platform.

Self-hosted, source-available alternative to OneTrust, Cookiebot, and
CookieYes. Full standards coverage (IAB TCF v2.2, GPP v1, Google
Consent Mode v2, GPC, Shopify Customer Privacy API), multi-tenant
architecture with role-based access, configuration cascade
(system → org → group → site → region), dark-pattern detection in
the scanner, and a tamper-evident consent record audit trail.

This is the initial public release. Prior development history is
retained internally.

See README.md for the feature list, architecture overview, and
quick-start instructions. Licensed under the Elastic License 2.0 —
self-host freely; do not resell as a managed service.
This commit is contained in:
James Cottrill
2026-04-13 14:20:15 +00:00
commit fbf26453f2
341 changed files with 62807 additions and 0 deletions

View File

View File

@@ -0,0 +1,144 @@
"""Tests for cookie classification — CMP-21."""
from src.classifier import (
ClassificationResult,
KnownPattern,
_domain_matches,
classify_cookie,
)
# ── Domain matching ──────────────────────────────────────────────────
class TestDomainMatching:
    """Behaviour of the internal `_domain_matches` helper."""

    def test_wildcard_matches_any(self):
        # The "*" pattern accepts every domain.
        assert _domain_matches("example.com", "*") is True

    def test_exact_match(self):
        matched = _domain_matches("example.com", "example.com")
        assert matched is True

    def test_exact_no_match(self):
        matched = _domain_matches("other.com", "example.com")
        assert matched is False

    def test_subdomain_match(self):
        # A subdomain of the pattern's registrable domain matches.
        assert _domain_matches("sub.example.com", "example.com") is True

    def test_leading_dot_stripped(self):
        # Cookie-style ".example.com" domains normalise to "example.com".
        assert _domain_matches(".example.com", "example.com") is True

    def test_pattern_leading_dot(self):
        # The leading dot is also ignored on the pattern side.
        assert _domain_matches("example.com", ".example.com") is True

    def test_case_insensitive(self):
        assert _domain_matches("Example.COM", "example.com") is True

    def test_no_partial_match(self):
        # A suffix not on a dot boundary ("notexample.com") must not match.
        assert _domain_matches("notexample.com", "example.com") is False
# ── Cookie classification ────────────────────────────────────────────
# Shared fixture: a small known-cookie database covering exact names
# (_ga, _gid, __cf_bm), a shell-style wildcard (_ga_*), a
# domain-restricted entry (_fbp on .facebook.com), and two regex
# patterns (is_regex=True) for Hotjar and Matomo.
# NOTE(review): list order appears to matter for match priority
# (exact before wildcard) — confirm against classify_cookie.
PATTERNS = [
    KnownPattern(name_pattern="_ga", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(name_pattern="_ga_*", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(name_pattern="_gid", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(
        name_pattern="_fbp", domain_pattern=".facebook.com", category="marketing", vendor="Meta"
    ),
    KnownPattern(
        name_pattern="__cf_bm",
        domain_pattern="*",
        category="necessary",
        vendor="Cloudflare",
    ),
    KnownPattern(
        name_pattern="_hj.*",
        domain_pattern="*",
        category="analytics",
        vendor="Hotjar",
        is_regex=True,
    ),
    KnownPattern(
        name_pattern="^_pk_id\\..*",
        domain_pattern="*",
        category="analytics",
        vendor="Matomo",
        is_regex=True,
    ),
]
class TestClassifyCookie:
    """End-to-end classification against the shared PATTERNS fixture."""

    def test_exact_match(self):
        res = classify_cookie("_ga", "example.com", PATTERNS)
        assert res.category == "analytics"
        assert res.vendor == "Google"
        assert res.match_source == "exact"

    def test_wildcard_match(self):
        res = classify_cookie("_ga_ABC123", "example.com", PATTERNS)
        assert res.category == "analytics"
        assert res.match_source == "wildcard"

    def test_regex_match(self):
        res = classify_cookie("_hjSession_123", "example.com", PATTERNS)
        assert res.category == "analytics"
        assert res.vendor == "Hotjar"
        assert res.match_source == "regex"

    def test_regex_matomo(self):
        res = classify_cookie("_pk_id.1.abc1", "example.com", PATTERNS)
        assert res.category == "analytics"
        assert res.vendor == "Matomo"
        assert res.match_source == "regex"

    def test_domain_specific_match(self):
        # _fbp is registered only for .facebook.com, so a facebook
        # subdomain should classify it.
        res = classify_cookie("_fbp", "sub.facebook.com", PATTERNS)
        assert res.category == "marketing"
        assert res.vendor == "Meta"

    def test_domain_mismatch(self):
        # The same cookie name on an unrelated domain stays unmatched.
        res = classify_cookie("_fbp", "example.com", PATTERNS)
        assert res.category is None
        assert res.match_source == "unmatched"

    def test_unmatched_cookie(self):
        res = classify_cookie("unknown_cookie", "example.com", PATTERNS)
        assert res.category is None
        assert res.match_source == "unmatched"

    def test_necessary_cookie(self):
        res = classify_cookie("__cf_bm", "example.com", PATTERNS)
        assert res.category == "necessary"
        assert res.vendor == "Cloudflare"

    def test_empty_patterns(self):
        # No patterns at all — nothing can match.
        res = classify_cookie("_ga", "example.com", [])
        assert res.category is None

    def test_exact_takes_priority_over_wildcard(self):
        """Exact match should come before wildcard in pattern list."""
        overlapping = [
            KnownPattern(name_pattern="_ga", domain_pattern="*", category="analytics"),
            KnownPattern(name_pattern="_ga*", domain_pattern="*", category="marketing"),
        ]
        res = classify_cookie("_ga", "example.com", overlapping)
        assert res.category == "analytics"
        assert res.match_source == "exact"
# ── ClassificationResult ─────────────────────────────────────────────
class TestClassificationResult:
    """Defaults and field round-trip of the ClassificationResult type."""

    def test_defaults(self):
        res = ClassificationResult(category=None)
        # Vendor defaults to unknown; source defaults to "unmatched".
        assert res.vendor is None
        assert res.match_source == "unmatched"

    def test_with_values(self):
        res = ClassificationResult(category="analytics", vendor="Google", match_source="exact")
        assert res.category == "analytics"
        assert res.vendor == "Google"

View File

@@ -0,0 +1,112 @@
"""Tests for consent signal validation — mocks Playwright."""
from unittest.mock import AsyncMock
import pytest
from src.consent_validator import (
_is_tracker_request,
validate_post_reject,
validate_pre_consent,
)
class TestIsTrackerRequest:
    """URL-based tracker detection for known ad/analytics hosts."""

    def test_known_tracker(self) -> None:
        url = "https://www.google-analytics.com/collect"
        assert _is_tracker_request(url) is True

    def test_facebook_tracker(self) -> None:
        url = "https://connect.facebook.net/en_US/fbevents.js"
        assert _is_tracker_request(url) is True

    def test_non_tracker(self) -> None:
        # A first-party stylesheet is not a tracker.
        assert _is_tracker_request("https://example.com/style.css") is False

    def test_empty_url(self) -> None:
        assert _is_tracker_request("") is False

    def test_doubleclick(self) -> None:
        assert _is_tracker_request("https://ad.doubleclick.net/pixel") is True

    def test_hotjar(self) -> None:
        assert _is_tracker_request("https://static.hotjar.com/c/hotjar.js") is True
class TestValidatePreConsent:
    """Pre-consent validation: nothing non-essential may fire before consent."""

    @pytest.mark.asyncio
    async def test_no_issues_with_only_essential_cookies(self) -> None:
        mock_page = AsyncMock()
        mock_page.evaluate = AsyncMock(return_value={"available": False})
        mock_ctx = AsyncMock()
        mock_ctx.cookies = AsyncMock(
            return_value=[{"name": "session_id", "domain": "example.com"}]
        )
        issues = await validate_pre_consent(mock_page, mock_ctx, {"session_id"}, [])
        assert len(issues) == 0

    @pytest.mark.asyncio
    async def test_non_essential_cookies_flagged(self) -> None:
        mock_page = AsyncMock()
        mock_page.evaluate = AsyncMock(return_value={"available": False})
        # Two tracking cookies alongside the allowed essential one.
        observed = [
            {"name": "session_id", "domain": "example.com"},
            {"name": "_ga", "domain": ".google-analytics.com"},
            {"name": "_fbp", "domain": ".facebook.com"},
        ]
        mock_ctx = AsyncMock()
        mock_ctx.cookies = AsyncMock(return_value=observed)
        issues = await validate_pre_consent(mock_page, mock_ctx, {"session_id"}, [])
        assert len(issues) >= 1
        cookie_issue = next(i for i in issues if i.check == "pre_consent_cookies")
        assert cookie_issue.severity == "critical"
        assert "_ga" in cookie_issue.message

    @pytest.mark.asyncio
    async def test_tracker_requests_flagged(self) -> None:
        mock_page = AsyncMock()
        mock_page.evaluate = AsyncMock(return_value={"available": False})
        mock_ctx = AsyncMock()
        mock_ctx.cookies = AsyncMock(return_value=[])
        issues = await validate_pre_consent(
            mock_page, mock_ctx, set(), ["https://www.google-analytics.com/collect?v=1"]
        )
        assert len(issues) >= 1
        tracker_issue = next(i for i in issues if i.check == "pre_consent_trackers")
        assert tracker_issue.severity == "critical"
class TestValidatePostReject:
    """Post-reject validation: rejecting consent must actually stop tracking."""

    @pytest.mark.asyncio
    async def test_clean_rejection(self) -> None:
        mock_page = AsyncMock()
        mock_ctx = AsyncMock()
        mock_ctx.cookies = AsyncMock(return_value=[])
        issues = await validate_post_reject(mock_page, mock_ctx, set(), [])
        assert len(issues) == 0

    @pytest.mark.asyncio
    async def test_cookies_after_reject_flagged(self) -> None:
        mock_page = AsyncMock()
        mock_ctx = AsyncMock()
        mock_ctx.cookies = AsyncMock(
            return_value=[{"name": "_ga", "domain": ".google-analytics.com"}]
        )
        issues = await validate_post_reject(mock_page, mock_ctx, set(), [])
        assert len(issues) >= 1
        assert issues[0].check == "post_reject_cookies"

    @pytest.mark.asyncio
    async def test_trackers_after_reject_flagged(self) -> None:
        mock_page = AsyncMock()
        mock_ctx = AsyncMock()
        mock_ctx.cookies = AsyncMock(return_value=[])
        issues = await validate_post_reject(
            mock_page, mock_ctx, set(), ["https://www.google-analytics.com/collect"]
        )
        assert len(issues) >= 1
        assert issues[0].check == "post_reject_trackers"

View File

@@ -0,0 +1,440 @@
"""Tests for the Playwright cookie crawler — CMP-21.
These tests mock Playwright to avoid requiring an actual browser.
"""
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from src.crawler import (
CookieCrawler,
CrawlResult,
DiscoveredCookie,
SiteCrawlResult,
_build_initiator_chain,
_get_script_initiator,
)
# ── Fixtures ────────────────────────────────────────────────────────────
def _make_mock_page(
*,
cookies: list[dict] | None = None,
ls_items: list[dict] | None = None,
ss_items: list[dict] | None = None,
):
"""Build a mock Playwright Page object."""
page = AsyncMock()
page.goto = AsyncMock()
page.on = MagicMock() # synchronous registration
# page.evaluate returns different results for localStorage vs sessionStorage
eval_results = []
eval_results.append(ls_items or [])
eval_results.append(ss_items or [])
page.evaluate = AsyncMock(side_effect=eval_results)
return page
def _make_mock_context(page, cookies: list[dict] | None = None):
"""Build a mock BrowserContext."""
context = AsyncMock()
context.new_page = AsyncMock(return_value=page)
context.cookies = AsyncMock(return_value=cookies or [])
context.clear_cookies = AsyncMock()
context.close = AsyncMock()
return context
def _make_mock_browser(context):
"""Build a mock Browser."""
browser = AsyncMock()
browser.new_context = AsyncMock(return_value=context)
browser.close = AsyncMock()
return browser
# ── DiscoveredCookie dataclass ──────────────────────────────────────────
class TestDiscoveredCookie:
    """Field defaults and full construction of DiscoveredCookie."""

    def test_defaults(self):
        cookie = DiscoveredCookie(name="_ga", domain="example.com")
        # Only name/domain are required; everything else is defaulted.
        assert cookie.storage_type == "cookie"
        assert cookie.path is None
        assert cookie.expires is None
        assert cookie.http_only is None
        assert cookie.secure is None
        assert cookie.same_site is None
        assert cookie.value_length == 0
        assert cookie.script_source is None
        assert cookie.page_url == ""

    def test_initiator_chain_defaults_to_empty(self):
        cookie = DiscoveredCookie(name="_ga", domain="example.com")
        assert cookie.initiator_chain == []

    def test_with_all_fields(self):
        chain = ["https://example.com/", "https://cdn.example.com/tracker.js"]
        cookie = DiscoveredCookie(
            name="_ga",
            domain=".example.com",
            storage_type="cookie",
            path="/",
            expires=1700000000.0,
            http_only=True,
            secure=True,
            same_site="Lax",
            value_length=42,
            script_source="https://cdn.example.com/tracker.js",
            page_url="https://example.com/",
            initiator_chain=chain,
        )
        assert cookie.http_only is True
        assert cookie.value_length == 42
        assert len(cookie.initiator_chain) == 2
# ── CrawlResult dataclass ──────────────────────────────────────────────
class TestCrawlResult:
    """CrawlResult defaults and error propagation."""

    def test_defaults(self):
        page_result = CrawlResult(url="https://example.com/")
        assert page_result.cookies == []
        assert page_result.error is None

    def test_with_error(self):
        page_result = CrawlResult(url="https://example.com/", error="Timeout")
        assert page_result.error == "Timeout"
# ── SiteCrawlResult ────────────────────────────────────────────────────
class TestSiteCrawlResult:
    """Deduplication semantics of SiteCrawlResult.unique_cookies."""

    def test_unique_cookies_deduplicates(self):
        ga_first = DiscoveredCookie(name="_ga", domain="example.com", storage_type="cookie")
        ga_second = DiscoveredCookie(name="_ga", domain="example.com", storage_type="cookie")
        gid = DiscoveredCookie(name="_gid", domain="example.com", storage_type="cookie")
        site = SiteCrawlResult(
            domain="example.com",
            pages=[
                CrawlResult(url="https://example.com/", cookies=[ga_first, gid]),
                CrawlResult(url="https://example.com/about", cookies=[ga_second]),
            ],
            total_cookies_found=3,
        )
        deduped = site.unique_cookies
        # _ga appears on two pages but counts once.
        assert len(deduped) == 2
        assert {c.name for c in deduped} == {"_ga", "_gid"}

    def test_unique_cookies_separates_storage_types(self):
        """Same name in cookie vs localStorage should be separate entries."""
        as_cookie = DiscoveredCookie(name="token", domain="example.com", storage_type="cookie")
        as_ls = DiscoveredCookie(name="token", domain="example.com", storage_type="local_storage")
        site = SiteCrawlResult(
            domain="example.com",
            pages=[CrawlResult(url="https://example.com/", cookies=[as_cookie, as_ls])],
            total_cookies_found=2,
        )
        assert len(site.unique_cookies) == 2

    def test_empty_pages(self):
        assert SiteCrawlResult(domain="example.com").unique_cookies == []
# ── _get_script_initiator ──────────────────────────────────────────────
class TestGetScriptInitiator:
    """Resolution of the script URL that initiated a network request."""

    def test_identifies_js_url(self):
        req = MagicMock()
        req.url = "https://cdn.example.com/tracker.js"
        req.resource_type = "script"
        req.redirected_from = None
        assert _get_script_initiator(req) == "https://cdn.example.com/tracker.js"

    def test_follows_redirect_chain(self):
        # The script sits at the far end of a redirect chain.
        script_req = MagicMock()
        script_req.url = "https://cdn.example.com/analytics.js"
        script_req.resource_type = "script"
        script_req.redirected_from = None
        final_req = MagicMock()
        final_req.url = "https://example.com/track"
        final_req.resource_type = "fetch"
        final_req.redirected_from = script_req
        assert _get_script_initiator(final_req) == "https://cdn.example.com/analytics.js"

    def test_returns_none_for_non_script(self):
        req = MagicMock()
        req.url = "https://example.com/image.png"
        req.resource_type = "image"
        req.redirected_from = None
        assert _get_script_initiator(req) is None

    def test_handles_javascript_resource_type(self):
        # "javascript" is accepted as a resource type alongside "script".
        req = MagicMock()
        req.url = "https://example.com/bundle"
        req.resource_type = "javascript"
        req.redirected_from = None
        assert _get_script_initiator(req) == "https://example.com/bundle"

    def test_handles_circular_redirect(self):
        """Should not loop infinitely on circular redirects."""
        req_one = MagicMock()
        req_one.url = "https://example.com/a"
        req_one.resource_type = "fetch"
        req_two = MagicMock()
        req_two.url = "https://example.com/b"
        req_two.resource_type = "fetch"
        # Tie the two requests into a cycle.
        req_one.redirected_from = req_two
        req_two.redirected_from = req_one
        # Neither is a script, so the resolver must give up cleanly.
        assert _get_script_initiator(req_one) is None
# ── _build_initiator_chain ────────────────────────────────────────────
class TestBuildInitiatorChain:
    """Reconstruction of loader chains from a child-to-parent URL map."""

    def test_single_url_no_parent(self):
        assert _build_initiator_chain("https://example.com/script.js", {}) == [
            "https://example.com/script.js"
        ]

    def test_two_level_chain(self):
        parents = {"https://cdn.example.com/tracker.js": "https://example.com/"}
        assert _build_initiator_chain("https://cdn.example.com/tracker.js", parents) == [
            "https://example.com/",
            "https://cdn.example.com/tracker.js",
        ]

    def test_three_level_chain(self):
        parents = {
            "https://cdn.example.com/pixel.js": "https://cdn.example.com/gtm.js",
            "https://cdn.example.com/gtm.js": "https://example.com/",
        }
        expected = [
            "https://example.com/",
            "https://cdn.example.com/gtm.js",
            "https://cdn.example.com/pixel.js",
        ]
        assert _build_initiator_chain("https://cdn.example.com/pixel.js", parents) == expected

    def test_respects_max_depth(self):
        # Parent map much deeper than max_depth.
        parents = {
            f"https://example.com/s{i + 1}.js": f"https://example.com/s{i}.js"
            for i in range(25)
        }
        chain = _build_initiator_chain("https://example.com/s25.js", parents, max_depth=5)
        # Capped: the leaf plus at most 5 ancestors.
        assert len(chain) <= 6

    def test_handles_circular_reference(self):
        parents = {
            "https://a.com/a.js": "https://b.com/b.js",
            "https://b.com/b.js": "https://a.com/a.js",
        }
        # The cycle must be broken rather than looping forever.
        assert len(_build_initiator_chain("https://a.com/a.js", parents)) == 2
# ── CookieCrawler._crawl_page ──────────────────────────────────────────
class TestCrawlPage:
    """Unit tests for CookieCrawler._crawl_page over fully mocked Playwright objects.

    Each test builds page/context/browser mocks with the module helpers and
    drives a single-page crawl; no real browser is launched.
    """

    @pytest.mark.asyncio(loop_scope="session")
    async def test_discovers_browser_cookies(self):
        # CDP-shaped cookie dict, as returned by BrowserContext.cookies().
        cdp_cookies = [
            {
                "name": "_ga",
                "domain": ".example.com",
                "path": "/",
                "expires": 1700000000,
                "httpOnly": False,
                "secure": True,
                "sameSite": "Lax",
                "value": "GA1.2.12345",
            }
        ]
        page = _make_mock_page()
        context = _make_mock_context(page, cookies=cdp_cookies)
        browser = _make_mock_browser(context)
        crawler = CookieCrawler()
        result = await crawler._crawl_page(browser, "https://example.com/")
        assert len(result.cookies) == 1
        assert result.cookies[0].name == "_ga"
        assert result.cookies[0].domain == ".example.com"
        assert result.cookies[0].storage_type == "cookie"
        assert result.cookies[0].secure is True
        # The cookie value itself is not retained — only its length.
        assert result.cookies[0].value_length == len("GA1.2.12345")
        assert result.error is None

    @pytest.mark.asyncio(loop_scope="session")
    async def test_discovers_local_storage(self):
        # localStorage entries come from the first page.evaluate call.
        ls_items = [{"name": "theme", "valueLength": 4}]
        page = _make_mock_page(ls_items=ls_items)
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)
        crawler = CookieCrawler()
        result = await crawler._crawl_page(browser, "https://example.com/")
        ls_cookies = [c for c in result.cookies if c.storage_type == "local_storage"]
        assert len(ls_cookies) == 1
        assert ls_cookies[0].name == "theme"
        assert ls_cookies[0].value_length == 4
        # Domain is derived from the crawled URL's host.
        assert ls_cookies[0].domain == "example.com"

    @pytest.mark.asyncio(loop_scope="session")
    async def test_discovers_session_storage(self):
        # sessionStorage entries come from the second page.evaluate call.
        ss_items = [{"name": "session_id", "valueLength": 36}]
        page = _make_mock_page(ss_items=ss_items)
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)
        crawler = CookieCrawler()
        result = await crawler._crawl_page(browser, "https://example.com/")
        ss_cookies = [c for c in result.cookies if c.storage_type == "session_storage"]
        assert len(ss_cookies) == 1
        assert ss_cookies[0].name == "session_id"

    @pytest.mark.asyncio(loop_scope="session")
    async def test_handles_page_error(self):
        # A navigation failure is captured in result.error, not raised.
        page = _make_mock_page()
        page.goto = AsyncMock(side_effect=Exception("Navigation timeout"))
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)
        crawler = CookieCrawler()
        result = await crawler._crawl_page(browser, "https://example.com/")
        assert result.error == "Navigation timeout"

    @pytest.mark.asyncio(loop_scope="session")
    async def test_context_closed_after_crawl(self):
        page = _make_mock_page()
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)
        crawler = CookieCrawler()
        await crawler._crawl_page(browser, "https://example.com/")
        context.close.assert_awaited_once()

    @pytest.mark.asyncio(loop_scope="session")
    async def test_context_closed_on_error(self):
        # The context must be closed even when navigation blows up.
        page = _make_mock_page()
        page.goto = AsyncMock(side_effect=Exception("fail"))
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)
        crawler = CookieCrawler()
        await crawler._crawl_page(browser, "https://example.com/")
        context.close.assert_awaited_once()

    @pytest.mark.asyncio(loop_scope="session")
    async def test_custom_user_agent(self):
        # A user_agent passed to the crawler is forwarded to new_context().
        page = _make_mock_page()
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)
        crawler = CookieCrawler(user_agent="CMPBot/1.0")
        await crawler._crawl_page(browser, "https://example.com/")
        browser.new_context.assert_awaited_once()
        call_kwargs = browser.new_context.call_args[1]
        assert call_kwargs["user_agent"] == "CMPBot/1.0"
# ── CookieCrawler.crawl_site ───────────────────────────────────────────
class TestCrawlSite:
    """Tests for CookieCrawler.crawl_site with src.crawler.async_playwright patched.

    async_playwright() is an async context manager, so each test wires
    __aenter__/__aexit__ on the patch's return value by hand.
    """

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.crawler.async_playwright")
    async def test_crawls_multiple_pages(self, mock_pw):
        cdp_cookies = [{"name": "_ga", "domain": ".example.com", "value": "x"}]
        page = _make_mock_page()
        context = _make_mock_context(page, cookies=cdp_cookies)
        browser = _make_mock_browser(context)
        pw_instance = AsyncMock()
        pw_instance.chromium.launch = AsyncMock(return_value=browser)
        # Wire the async context-manager protocol onto the patched factory.
        mock_pw.return_value.__aenter__ = AsyncMock(return_value=pw_instance)
        mock_pw.return_value.__aexit__ = AsyncMock(return_value=False)
        crawler = CookieCrawler()
        result = await crawler.crawl_site(["https://example.com/", "https://example.com/about"])
        assert result.domain == "example.com"
        assert len(result.pages) == 2
        # The same mock context serves both pages, so the cookie is seen twice.
        assert result.total_cookies_found >= 2

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.crawler.async_playwright")
    async def test_respects_max_pages(self, mock_pw):
        page = _make_mock_page()
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)
        pw_instance = AsyncMock()
        pw_instance.chromium.launch = AsyncMock(return_value=browser)
        mock_pw.return_value.__aenter__ = AsyncMock(return_value=pw_instance)
        mock_pw.return_value.__aexit__ = AsyncMock(return_value=False)
        # Ten candidate URLs, but the crawl is capped at three.
        urls = [f"https://example.com/page{i}" for i in range(10)]
        crawler = CookieCrawler()
        result = await crawler.crawl_site(urls, max_pages=3)
        assert len(result.pages) == 3

    @pytest.mark.asyncio(loop_scope="session")
    async def test_empty_urls(self):
        # No URLs: no browser work at all, empty result.
        crawler = CookieCrawler()
        result = await crawler.crawl_site([])
        assert result.domain == ""
        assert result.pages == []

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.crawler.async_playwright")
    async def test_browser_closed_after_crawl(self, mock_pw):
        page = _make_mock_page()
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)
        pw_instance = AsyncMock()
        pw_instance.chromium.launch = AsyncMock(return_value=browser)
        mock_pw.return_value.__aenter__ = AsyncMock(return_value=pw_instance)
        mock_pw.return_value.__aexit__ = AsyncMock(return_value=False)
        crawler = CookieCrawler()
        await crawler.crawl_site(["https://example.com/"])
        browser.close.assert_awaited_once()

View File

@@ -0,0 +1,100 @@
"""Tests for crawler proxy configuration.
Mocks Playwright to avoid requiring an actual browser installation.
"""
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from src.crawler import CookieCrawler, ProxyConfig
class TestProxyConfig:
    """Tests for ProxyConfig dataclass."""

    def test_proxy_config_creation(self) -> None:
        cfg = ProxyConfig(server="http://proxy.example.com:8080")
        assert cfg.server == "http://proxy.example.com:8080"
        # Credentials default to "no auth".
        assert cfg.username is None
        assert cfg.password is None

    def test_proxy_config_with_auth(self) -> None:
        cfg = ProxyConfig(
            server="http://proxy.example.com:8080",
            username="user",
            password="pass",
        )
        assert cfg.username == "user"
        assert cfg.password == "pass"
class TestCookieCrawlerProxy:
    """Tests for CookieCrawler proxy support."""

    def test_crawler_without_proxy(self) -> None:
        crawler = CookieCrawler(headless=True)
        assert crawler._proxy is None

    def test_crawler_with_proxy(self) -> None:
        proxy = ProxyConfig(server="http://proxy.example.com:8080")
        crawler = CookieCrawler(headless=True, proxy=proxy)
        assert crawler._proxy is not None
        assert crawler._proxy.server == "http://proxy.example.com:8080"

    def test_crawler_with_socks5_proxy(self) -> None:
        # SOCKS5 server strings are stored as-is.
        proxy = ProxyConfig(server="socks5://proxy.example.com:1080")
        crawler = CookieCrawler(headless=True, proxy=proxy)
        assert crawler._proxy.server == "socks5://proxy.example.com:1080"

    @pytest.mark.asyncio
    async def test_crawl_passes_proxy_to_browser(self) -> None:
        """Verify that proxy config is passed to Playwright launch."""
        proxy = ProxyConfig(
            server="http://proxy.example.com:8080",
            username="user",
            password="pass",
        )
        crawler = CookieCrawler(headless=True, proxy=proxy)
        mock_browser = AsyncMock()
        mock_browser.close = AsyncMock()
        mock_pw = MagicMock()
        mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)
        # async_playwright() is an async context manager — wire both halves.
        mock_context_manager = AsyncMock()
        mock_context_manager.__aenter__ = AsyncMock(return_value=mock_pw)
        mock_context_manager.__aexit__ = AsyncMock(return_value=False)
        with patch("src.crawler.async_playwright", return_value=mock_context_manager):
            await crawler.crawl_site(["https://example.com/"], max_pages=1)
        # Verify proxy was passed to browser launch
        mock_pw.chromium.launch.assert_called_once()
        call_kwargs = mock_pw.chromium.launch.call_args[1]
        assert "proxy" in call_kwargs
        assert call_kwargs["proxy"]["server"] == "http://proxy.example.com:8080"
        assert call_kwargs["proxy"]["username"] == "user"
        assert call_kwargs["proxy"]["password"] == "pass"

    @pytest.mark.asyncio
    async def test_crawl_without_proxy_omits_proxy_kwarg(self) -> None:
        """Verify that no proxy is passed when none is configured."""
        crawler = CookieCrawler(headless=True)
        mock_browser = AsyncMock()
        mock_browser.close = AsyncMock()
        mock_pw = MagicMock()
        mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)
        mock_context_manager = AsyncMock()
        mock_context_manager.__aenter__ = AsyncMock(return_value=mock_pw)
        mock_context_manager.__aexit__ = AsyncMock(return_value=False)
        with patch("src.crawler.async_playwright", return_value=mock_context_manager):
            await crawler.crawl_site(["https://example.com/"], max_pages=1)
        # The "proxy" kwarg must be omitted entirely, not passed as None.
        call_kwargs = mock_pw.chromium.launch.call_args[1]
        assert "proxy" not in call_kwargs

View File

@@ -0,0 +1,153 @@
"""Tests for dark pattern detection — mocks Playwright."""
from unittest.mock import AsyncMock
import pytest
from src.dark_pattern_detector import (
check_button_prominence,
check_cookie_wall,
check_pre_ticked_boxes,
detect_dark_patterns,
)
class TestCheckButtonProminence:
    """Accept/reject button prominence comparison (equal-prominence rule)."""

    @pytest.mark.asyncio
    async def test_no_accept_button_returns_empty(self) -> None:
        # With no accept button found there is nothing to compare.
        page = AsyncMock()
        page.query_selector_all = AsyncMock(return_value=[])
        issues = await check_button_prominence(page)
        assert issues == []

    @pytest.mark.asyncio
    async def test_missing_reject_button_flagged(self) -> None:
        # Accept button visible, reject not found
        accept_el = AsyncMock()
        accept_el.is_visible = AsyncMock(return_value=True)
        accept_el.evaluate = AsyncMock(
            return_value={
                "width": 200,
                "height": 40,
                "area": 8000,
                "backgroundColor": "rgb(37, 99, 235)",
                "color": "rgb(255, 255, 255)",
                "fontSize": 16,
                "fontWeight": "600",
                "padding": "8px 16px",
                "text": "Accept All",
                "visible": True,
            }
        )

        # Accept-style selectors return the button; every other selector
        # (including all reject selectors) finds nothing.
        # NOTE: the previous version kept a `nonlocal call_count` counter
        # here that was never read — removed as dead code.
        async def _mock_query(selector):
            if "Accept" in selector or "Allow" in selector or "accept" in selector:
                return [accept_el]
            return []

        page = AsyncMock()
        page.query_selector_all = _mock_query
        issues = await check_button_prominence(page)
        assert any(i.pattern == "missing_reject_button" for i in issues)

    @pytest.mark.asyncio
    async def test_unequal_button_size_flagged(self) -> None:
        # Accept is 5x the area and visually heavier than reject.
        accept_el = AsyncMock()
        accept_el.is_visible = AsyncMock(return_value=True)
        accept_el.evaluate = AsyncMock(
            return_value={
                "width": 300,
                "height": 50,
                "area": 15000,
                "fontSize": 18,
                "fontWeight": "700",
                "text": "Accept All",
                "visible": True,
            }
        )
        reject_el = AsyncMock()
        reject_el.is_visible = AsyncMock(return_value=True)
        reject_el.evaluate = AsyncMock(
            return_value={
                "width": 100,
                "height": 30,
                "area": 3000,
                "fontSize": 12,
                "fontWeight": "400",
                "text": "Reject",
                "visible": True,
            }
        )

        async def _mock_query(selector):
            if "Accept" in selector or "Allow" in selector or "accept" in selector:
                return [accept_el]
            if "Reject" in selector or "Decline" in selector or "reject" in selector:
                return [reject_el]
            return []

        page = AsyncMock()
        page.query_selector_all = _mock_query
        issues = await check_button_prominence(page)
        assert any(i.pattern == "unequal_button_size" for i in issues)
class TestCheckPreTickedBoxes:
    """Pre-ticked non-essential category checkboxes are a dark pattern."""

    @pytest.mark.asyncio
    async def test_no_pre_ticked_returns_empty(self) -> None:
        page = AsyncMock()
        page.evaluate = AsyncMock(return_value=[])
        assert await check_pre_ticked_boxes(page) == []

    @pytest.mark.asyncio
    async def test_pre_ticked_non_essential_flagged(self) -> None:
        ticked = [
            {"name": "analytics", "label": "Analytics Cookies"},
            {"name": "marketing", "label": "Marketing Cookies"},
        ]
        page = AsyncMock()
        page.evaluate = AsyncMock(return_value=ticked)
        issues = await check_pre_ticked_boxes(page)
        # Both boxes are reported in a single aggregated issue.
        assert len(issues) == 1
        assert issues[0].pattern == "pre_ticked_checkboxes"
        assert issues[0].severity == "critical"
class TestCheckCookieWall:
    """Cookie walls (no site access without consent) are always critical."""

    @pytest.mark.asyncio
    async def test_no_wall_returns_empty(self) -> None:
        page = AsyncMock()
        page.evaluate = AsyncMock(return_value=False)
        assert await check_cookie_wall(page) == []

    @pytest.mark.asyncio
    async def test_wall_detected(self) -> None:
        page = AsyncMock()
        page.evaluate = AsyncMock(return_value=True)
        issues = await check_cookie_wall(page)
        assert len(issues) == 1
        assert issues[0].pattern == "cookie_wall"
        assert issues[0].severity == "critical"
class TestDetectDarkPatterns:
    """Top-level orchestration of the individual dark-pattern checks."""

    @pytest.mark.asyncio
    async def test_no_banner_returns_empty(self) -> None:
        page = AsyncMock()
        page.url = "https://example.com/"
        # No selector matches anything: no banner, hence no issues.
        page.query_selector_all = AsyncMock(return_value=[])
        result = await detect_dark_patterns(page)
        assert result.banner_found is False
        assert result.issues == []

View File

@@ -0,0 +1,275 @@
"""Tests for sitemap URL discovery — CMP-21."""
from unittest.mock import AsyncMock, patch
import httpx
import pytest
from src.sitemap import _fetch_sitemap, _find_sitemap_in_robots, discover_urls
# ── Helpers ─────────────────────────────────────────────────────────────
def _make_response(status_code: int = 200, text: str = "") -> httpx.Response:
    """Construct a canned httpx.Response bound to a dummy GET request."""
    dummy_request = httpx.Request("GET", "http://x")
    return httpx.Response(status_code=status_code, text=text, request=dummy_request)
# Minimal <urlset> sitemap listing three page URLs.
SITEMAP_XML = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page1</loc></url>
<url><loc>https://example.com/page2</loc></url>
<url><loc>https://example.com/page3</loc></url>
</urlset>
"""
# A <sitemapindex> that points at two child sitemaps.
SITEMAP_INDEX_XML = """\
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>https://example.com/sitemap-main.xml</loc></sitemap>
<sitemap><loc>https://example.com/sitemap-blog.xml</loc></sitemap>
</sitemapindex>
"""
# Child sitemap referenced by SITEMAP_INDEX_XML (two blog URLs).
CHILD_SITEMAP_XML = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/blog/post1</loc></url>
<url><loc>https://example.com/blog/post2</loc></url>
</urlset>
"""
# robots.txt variants: with and without a Sitemap: directive.
ROBOTS_TXT_WITH_SITEMAP = """\
User-agent: *
Disallow: /admin/
Sitemap: https://example.com/custom-sitemap.xml
"""
ROBOTS_TXT_NO_SITEMAP = """\
User-agent: *
Disallow: /admin/
"""
# ── _fetch_sitemap ─────────────────────────────────────────────────────
class TestFetchSitemap:
    """Tests for _fetch_sitemap(client, url, max_urls) with a mocked httpx client."""

    @pytest.mark.asyncio(loop_scope="session")
    async def test_parses_regular_sitemap(self):
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        assert urls == [
            "https://example.com/page1",
            "https://example.com/page2",
            "https://example.com/page3",
        ]

    @pytest.mark.asyncio(loop_scope="session")
    async def test_respects_max_urls(self):
        # Sitemap holds 3 URLs but the cap is 2.
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 2)
        assert len(urls) == 2

    @pytest.mark.asyncio(loop_scope="session")
    async def test_handles_sitemap_index(self):
        """Sitemap index should recursively fetch child sitemaps."""
        # Route each fetched URL to its canned response.
        responses = {
            "https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML),
            "https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML),
            "https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML),
        }
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(side_effect=lambda url: responses[url])
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        # 3 URLs from the main sitemap + 2 from the blog child.
        assert len(urls) == 5
        assert "https://example.com/page1" in urls
        assert "https://example.com/blog/post1" in urls

    @pytest.mark.asyncio(loop_scope="session")
    async def test_sitemap_index_respects_max_urls(self):
        """Should stop fetching child sitemaps once max_urls is reached."""
        responses = {
            "https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML),
            "https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML),
            "https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML),
        }
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(side_effect=lambda url: responses[url])
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 3)
        assert len(urls) == 3

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_empty_on_404(self):
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(404))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        assert urls == []

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_empty_on_invalid_xml(self):
        # Malformed XML must be swallowed, not raised.
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, "not xml at all"))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        assert urls == []

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_empty_on_network_error(self):
        # Transport errors degrade to an empty result as well.
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(side_effect=httpx.ConnectError("Connection refused"))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        assert urls == []

    @pytest.mark.asyncio(loop_scope="session")
    async def test_empty_urlset(self):
        # A syntactically valid sitemap that lists no URLs.
        empty_sitemap = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>
"""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, empty_sitemap))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        assert urls == []
# ── _find_sitemap_in_robots ────────────────────────────────────────────
class TestFindSitemapInRobots:
    """Exercise ``_find_sitemap_in_robots`` against stubbed robots.txt fetches."""

    @staticmethod
    def _client_returning(response):
        # httpx.AsyncClient stand-in whose ``get`` always yields *response*.
        stub = AsyncMock(spec=httpx.AsyncClient)
        stub.get = AsyncMock(return_value=response)
        return stub

    @pytest.mark.asyncio(loop_scope="session")
    async def test_finds_sitemap_directive(self):
        stub = self._client_returning(_make_response(200, ROBOTS_TXT_WITH_SITEMAP))
        found = await _find_sitemap_in_robots(stub, "https://example.com/robots.txt")
        assert found == "https://example.com/custom-sitemap.xml"

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_none_when_no_directive(self):
        stub = self._client_returning(_make_response(200, ROBOTS_TXT_NO_SITEMAP))
        assert await _find_sitemap_in_robots(stub, "https://example.com/robots.txt") is None

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_none_on_404(self):
        stub = self._client_returning(_make_response(404))
        assert await _find_sitemap_in_robots(stub, "https://example.com/robots.txt") is None

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_none_on_network_error(self):
        stub = AsyncMock(spec=httpx.AsyncClient)
        stub.get = AsyncMock(side_effect=httpx.ConnectError("Connection refused"))
        assert await _find_sitemap_in_robots(stub, "https://example.com/robots.txt") is None

    @pytest.mark.asyncio(loop_scope="session")
    async def test_case_insensitive_directive(self):
        # Directive matching must ignore case ("sITEMAP:" still counts).
        robots = "User-agent: *\nsITEMAP: https://example.com/sm.xml\n"
        stub = self._client_returning(_make_response(200, robots))
        found = await _find_sitemap_in_robots(stub, "https://example.com/robots.txt")
        assert found == "https://example.com/sm.xml"
# ── discover_urls ──────────────────────────────────────────────────────
class TestDiscoverUrls:
    """Behaviour of ``discover_urls`` with its helper coroutines patched out."""

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_returns_sitemap_urls(self, robots_mock, fetch_mock):
        """Should return URLs from /sitemap.xml when available."""
        fetch_mock.return_value = [
            "https://example.com/page1",
            "https://example.com/page2",
        ]
        found = await discover_urls("example.com")
        assert found == ["https://example.com/page1", "https://example.com/page2"]
        # robots.txt must not even be consulted when the default sitemap works.
        robots_mock.assert_not_called()

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_falls_back_to_robots_txt(self, robots_mock, fetch_mock):
        """When sitemap.xml returns nothing, should try robots.txt."""
        # First fetch (default /sitemap.xml) is empty; the second fetch,
        # driven by the robots-advertised sitemap, succeeds.
        fetch_mock.side_effect = [[], ["https://example.com/from-robots"]]
        robots_mock.return_value = "https://example.com/alt-sitemap.xml"
        found = await discover_urls("example.com")
        assert found == ["https://example.com/from-robots"]

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_falls_back_to_default_paths(self, robots_mock, fetch_mock):
        """When no sitemap exists, should return default paths."""
        fetch_mock.return_value = []
        robots_mock.return_value = None
        found = await discover_urls("example.com")
        for expected in (
            "https://example.com/",
            "https://example.com/privacy",
            "https://example.com/cookie-policy",
        ):
            assert expected in found

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_respects_max_urls(self, robots_mock, fetch_mock):
        fetch_mock.return_value = [f"https://example.com/page{i}" for i in range(100)]
        found = await discover_urls("example.com", max_urls=5)
        assert len(found) == 5

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_default_paths_respect_max_urls(self, robots_mock, fetch_mock):
        fetch_mock.return_value = []
        robots_mock.return_value = None
        found = await discover_urls("example.com", max_urls=3)
        assert len(found) == 3

# ── New file: tests for the scanner HTTP service ──────────────────────
"""Tests for the scanner HTTP service."""
from unittest.mock import AsyncMock, patch
import pytest
from fastapi.testclient import TestClient
from src.worker import create_app
@pytest.fixture
def client():
    """Provide a TestClient wired to a freshly-built scanner app."""
    return TestClient(create_app())
def test_health_endpoint(client):
    """Health endpoint returns ok."""
    response = client.get("/health")
    assert (response.status_code, response.json()) == (200, {"status": "ok"})
@patch("src.sitemap.discover_urls", new_callable=AsyncMock)
@patch("src.crawler.CookieCrawler.crawl_site", new_callable=AsyncMock)
def test_scan_endpoint_with_domain(mock_crawl, mock_discover, client):
"""POST /scan with just a domain discovers URLs and crawls."""
from src.crawler import CrawlResult, DiscoveredCookie, SiteCrawlResult
mock_discover.return_value = ["https://example.com/"]
mock_crawl.return_value = SiteCrawlResult(
domain="example.com",
pages=[
CrawlResult(
url="https://example.com/",
cookies=[
DiscoveredCookie(
name="_ga",
domain=".example.com",
storage_type="cookie",
page_url="https://example.com/",
value_length=30,
),
DiscoveredCookie(
name="session_id",
domain="example.com",
storage_type="cookie",
page_url="https://example.com/",
value_length=36,
http_only=True,
secure=True,
),
],
),
],
total_cookies_found=2,
)
resp = client.post("/scan", json={"domain": "example.com", "max_pages": 5})
assert resp.status_code == 200
data = resp.json()
assert data["domain"] == "example.com"
assert data["pages_crawled"] == 1
assert data["total_cookies"] == 2
assert len(data["cookies"]) == 2
assert data["cookies"][0]["name"] == "_ga"
assert data["cookies"][1]["name"] == "session_id"
assert data["cookies"][1]["secure"] is True
@patch("src.crawler.CookieCrawler.crawl_site", new_callable=AsyncMock)
def test_scan_endpoint_with_urls(mock_crawl, client):
"""POST /scan with explicit URLs skips URL discovery."""
from src.crawler import CrawlResult, SiteCrawlResult
mock_crawl.return_value = SiteCrawlResult(
domain="example.com",
pages=[CrawlResult(url="https://example.com/about", cookies=[])],
total_cookies_found=0,
)
resp = client.post(
"/scan",
json={
"domain": "example.com",
"urls": ["https://example.com/about"],
"max_pages": 1,
},
)
assert resp.status_code == 200
data = resp.json()
assert data["pages_crawled"] == 1
assert data["cookies"] == []
@patch("src.sitemap.discover_urls", new_callable=AsyncMock)
@patch("src.crawler.CookieCrawler.crawl_site", new_callable=AsyncMock)
def test_scan_endpoint_with_errors(mock_crawl, mock_discover, client):
"""Scan results include page errors."""
from src.crawler import CrawlResult, SiteCrawlResult
mock_discover.return_value = ["https://example.com/"]
mock_crawl.return_value = SiteCrawlResult(
domain="example.com",
pages=[
CrawlResult(url="https://example.com/", cookies=[], error="Timeout"),
],
total_cookies_found=0,
)
resp = client.post("/scan", json={"domain": "example.com"})
assert resp.status_code == 200
data = resp.json()
assert data["errors"] == ["Timeout"]
def test_scan_request_validation(client):
    """Missing domain returns 422."""
    # An empty JSON body lacks the required "domain" field.
    assert client.post("/scan", json={}).status_code == 422