feat: initial public release
ConsentOS — a privacy-first cookie consent management platform. Self-hosted, source-available alternative to OneTrust, Cookiebot, and CookieYes. Full standards coverage (IAB TCF v2.2, GPP v1, Google Consent Mode v2, GPC, Shopify Customer Privacy API), multi-tenant architecture with role-based access, configuration cascade (system → org → group → site → region), dark-pattern detection in the scanner, and a tamper-evident consent record audit trail. This is the initial public release. Prior development history is retained internally. See README.md for the feature list, architecture overview, and quick-start instructions. Licensed under the Elastic License 2.0 — self-host freely; do not resell as a managed service.
This commit is contained in:
0
apps/scanner/tests/__init__.py
Normal file
0
apps/scanner/tests/__init__.py
Normal file
144
apps/scanner/tests/test_classifier.py
Normal file
144
apps/scanner/tests/test_classifier.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Tests for cookie classification — CMP-21."""
|
||||
|
||||
from src.classifier import (
|
||||
ClassificationResult,
|
||||
KnownPattern,
|
||||
_domain_matches,
|
||||
classify_cookie,
|
||||
)
|
||||
|
||||
# ── Domain matching ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestDomainMatching:
    """Behavioural checks for the private ``_domain_matches`` helper."""

    def test_wildcard_matches_any(self):
        # "*" is the catch-all pattern: any domain qualifies.
        assert _domain_matches("example.com", "*") is True

    def test_exact_match(self):
        assert _domain_matches("example.com", "example.com") is True

    def test_exact_no_match(self):
        assert _domain_matches("other.com", "example.com") is False

    def test_subdomain_match(self):
        # A subdomain of the pattern's domain counts as a match.
        assert _domain_matches("sub.example.com", "example.com") is True

    def test_leading_dot_stripped(self):
        # RFC-style leading dot on the cookie domain is ignored.
        assert _domain_matches(".example.com", "example.com") is True

    def test_pattern_leading_dot(self):
        # ...and likewise a leading dot on the pattern side.
        assert _domain_matches("example.com", ".example.com") is True

    def test_case_insensitive(self):
        assert _domain_matches("Example.COM", "example.com") is True

    def test_no_partial_match(self):
        # Suffix overlap alone ("notexample.com") must not be treated as
        # a subdomain of "example.com".
        assert _domain_matches("notexample.com", "example.com") is False
|
||||
|
||||
|
||||
# ── Cookie classification ────────────────────────────────────────────
|
||||
|
||||
|
||||
# Shared fixture: a small catalogue of known cookie patterns exercising all
# match modes — exact names, "*" wildcards, regexes, and a domain-restricted
# entry (_fbp, Meta) that only matches on facebook.com.
PATTERNS = [
    KnownPattern(name_pattern="_ga", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(name_pattern="_ga_*", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(name_pattern="_gid", domain_pattern="*", category="analytics", vendor="Google"),
    KnownPattern(
        name_pattern="_fbp", domain_pattern=".facebook.com", category="marketing", vendor="Meta"
    ),
    KnownPattern(
        name_pattern="__cf_bm",
        domain_pattern="*",
        category="necessary",
        vendor="Cloudflare",
    ),
    KnownPattern(
        name_pattern="_hj.*",
        domain_pattern="*",
        category="analytics",
        vendor="Hotjar",
        is_regex=True,
    ),
    KnownPattern(
        name_pattern="^_pk_id\\..*",
        domain_pattern="*",
        category="analytics",
        vendor="Matomo",
        is_regex=True,
    ),
]
|
||||
|
||||
|
||||
class TestClassifyCookie:
    """End-to-end classification against the shared PATTERNS fixture."""

    def test_exact_match(self):
        result = classify_cookie("_ga", "example.com", PATTERNS)
        assert result.category == "analytics"
        assert result.vendor == "Google"
        assert result.match_source == "exact"

    def test_wildcard_match(self):
        # "_ga_*" should pick up GA4 measurement-id cookies.
        result = classify_cookie("_ga_ABC123", "example.com", PATTERNS)
        assert result.category == "analytics"
        assert result.match_source == "wildcard"

    def test_regex_match(self):
        result = classify_cookie("_hjSession_123", "example.com", PATTERNS)
        assert result.category == "analytics"
        assert result.vendor == "Hotjar"
        assert result.match_source == "regex"

    def test_regex_matomo(self):
        # The Matomo pattern is anchored and contains an escaped dot.
        result = classify_cookie("_pk_id.1.abc1", "example.com", PATTERNS)
        assert result.category == "analytics"
        assert result.vendor == "Matomo"
        assert result.match_source == "regex"

    def test_domain_specific_match(self):
        # _fbp is scoped to .facebook.com; a subdomain still qualifies.
        result = classify_cookie("_fbp", "sub.facebook.com", PATTERNS)
        assert result.category == "marketing"
        assert result.vendor == "Meta"

    def test_domain_mismatch(self):
        # Same name on the wrong domain must not be classified.
        result = classify_cookie("_fbp", "example.com", PATTERNS)
        assert result.category is None
        assert result.match_source == "unmatched"

    def test_unmatched_cookie(self):
        result = classify_cookie("unknown_cookie", "example.com", PATTERNS)
        assert result.category is None
        assert result.match_source == "unmatched"

    def test_necessary_cookie(self):
        result = classify_cookie("__cf_bm", "example.com", PATTERNS)
        assert result.category == "necessary"
        assert result.vendor == "Cloudflare"

    def test_empty_patterns(self):
        # With no patterns at all, everything is unclassified.
        result = classify_cookie("_ga", "example.com", [])
        assert result.category is None

    def test_exact_takes_priority_over_wildcard(self):
        """Exact match should come before wildcard in pattern list."""
        patterns = [
            KnownPattern(name_pattern="_ga", domain_pattern="*", category="analytics"),
            KnownPattern(name_pattern="_ga*", domain_pattern="*", category="marketing"),
        ]
        result = classify_cookie("_ga", "example.com", patterns)
        assert result.category == "analytics"
        assert result.match_source == "exact"
|
||||
|
||||
|
||||
# ── ClassificationResult ─────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestClassificationResult:
    """Constructor defaults and field passthrough for ClassificationResult."""

    def test_defaults(self):
        # Only category is required; vendor and match_source have defaults.
        r = ClassificationResult(category=None)
        assert r.vendor is None
        assert r.match_source == "unmatched"

    def test_with_values(self):
        r = ClassificationResult(category="analytics", vendor="Google", match_source="exact")
        assert r.category == "analytics"
        assert r.vendor == "Google"
|
||||
112
apps/scanner/tests/test_consent_validator.py
Normal file
112
apps/scanner/tests/test_consent_validator.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Tests for consent signal validation — mocks Playwright."""
|
||||
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from src.consent_validator import (
|
||||
_is_tracker_request,
|
||||
validate_post_reject,
|
||||
validate_pre_consent,
|
||||
)
|
||||
|
||||
|
||||
class TestIsTrackerRequest:
    """URL heuristics in the private ``_is_tracker_request`` helper."""

    def test_known_tracker(self) -> None:
        assert _is_tracker_request("https://www.google-analytics.com/collect") is True

    def test_facebook_tracker(self) -> None:
        assert _is_tracker_request("https://connect.facebook.net/en_US/fbevents.js") is True

    def test_non_tracker(self) -> None:
        # First-party static assets are not trackers.
        assert _is_tracker_request("https://example.com/style.css") is False

    def test_empty_url(self) -> None:
        assert _is_tracker_request("") is False

    def test_doubleclick(self) -> None:
        assert _is_tracker_request("https://ad.doubleclick.net/pixel") is True

    def test_hotjar(self) -> None:
        assert _is_tracker_request("https://static.hotjar.com/c/hotjar.js") is True
|
||||
|
||||
|
||||
class TestValidatePreConsent:
    """validate_pre_consent: nothing non-essential may fire before consent."""

    @staticmethod
    def _page_without_cmp():
        """Page mock whose evaluate() reports no CMP API is available."""
        page = AsyncMock()
        page.evaluate = AsyncMock(return_value={"available": False})
        return page

    @pytest.mark.asyncio
    async def test_no_issues_with_only_essential_cookies(self) -> None:
        page = self._page_without_cmp()

        context = AsyncMock()
        context.cookies = AsyncMock(return_value=[{"name": "session_id", "domain": "example.com"}])

        # session_id is declared essential, so the page is clean.
        issues = await validate_pre_consent(page, context, {"session_id"}, [])
        assert len(issues) == 0

    @pytest.mark.asyncio
    async def test_non_essential_cookies_flagged(self) -> None:
        page = self._page_without_cmp()

        context = AsyncMock()
        context.cookies = AsyncMock(
            return_value=[
                {"name": "session_id", "domain": "example.com"},
                {"name": "_ga", "domain": ".google-analytics.com"},
                {"name": "_fbp", "domain": ".facebook.com"},
            ]
        )

        issues = await validate_pre_consent(page, context, {"session_id"}, [])
        assert len(issues) >= 1
        # The cookie check must report the non-essential names at critical severity.
        cookie_issue = next(i for i in issues if i.check == "pre_consent_cookies")
        assert cookie_issue.severity == "critical"
        assert "_ga" in cookie_issue.message

    @pytest.mark.asyncio
    async def test_tracker_requests_flagged(self) -> None:
        page = self._page_without_cmp()

        context = AsyncMock()
        context.cookies = AsyncMock(return_value=[])

        # Tracker network requests alone are enough to raise a critical issue.
        tracker_urls = ["https://www.google-analytics.com/collect?v=1"]
        issues = await validate_pre_consent(page, context, set(), tracker_urls)
        assert len(issues) >= 1
        tracker_issue = next(i for i in issues if i.check == "pre_consent_trackers")
        assert tracker_issue.severity == "critical"
|
||||
|
||||
|
||||
class TestValidatePostReject:
    """validate_post_reject: rejection must actually stop cookies/trackers."""

    @pytest.mark.asyncio
    async def test_clean_rejection(self) -> None:
        page = AsyncMock()
        context = AsyncMock()
        context.cookies = AsyncMock(return_value=[])

        # No cookies and no tracker traffic after reject → no findings.
        issues = await validate_post_reject(page, context, set(), [])
        assert len(issues) == 0

    @pytest.mark.asyncio
    async def test_cookies_after_reject_flagged(self) -> None:
        page = AsyncMock()
        context = AsyncMock()
        context.cookies = AsyncMock(
            return_value=[{"name": "_ga", "domain": ".google-analytics.com"}]
        )

        issues = await validate_post_reject(page, context, set(), [])
        assert len(issues) >= 1
        assert issues[0].check == "post_reject_cookies"

    @pytest.mark.asyncio
    async def test_trackers_after_reject_flagged(self) -> None:
        page = AsyncMock()
        context = AsyncMock()
        context.cookies = AsyncMock(return_value=[])

        tracker_urls = ["https://www.google-analytics.com/collect"]
        issues = await validate_post_reject(page, context, set(), tracker_urls)
        assert len(issues) >= 1
        assert issues[0].check == "post_reject_trackers"
|
||||
440
apps/scanner/tests/test_crawler.py
Normal file
440
apps/scanner/tests/test_crawler.py
Normal file
@@ -0,0 +1,440 @@
|
||||
"""Tests for the Playwright cookie crawler — CMP-21.
|
||||
|
||||
These tests mock Playwright to avoid requiring an actual browser.
|
||||
"""
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from src.crawler import (
|
||||
CookieCrawler,
|
||||
CrawlResult,
|
||||
DiscoveredCookie,
|
||||
SiteCrawlResult,
|
||||
_build_initiator_chain,
|
||||
_get_script_initiator,
|
||||
)
|
||||
|
||||
# ── Fixtures ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _make_mock_page(
|
||||
*,
|
||||
cookies: list[dict] | None = None,
|
||||
ls_items: list[dict] | None = None,
|
||||
ss_items: list[dict] | None = None,
|
||||
):
|
||||
"""Build a mock Playwright Page object."""
|
||||
page = AsyncMock()
|
||||
page.goto = AsyncMock()
|
||||
page.on = MagicMock() # synchronous registration
|
||||
|
||||
# page.evaluate returns different results for localStorage vs sessionStorage
|
||||
eval_results = []
|
||||
eval_results.append(ls_items or [])
|
||||
eval_results.append(ss_items or [])
|
||||
page.evaluate = AsyncMock(side_effect=eval_results)
|
||||
|
||||
return page
|
||||
|
||||
|
||||
def _make_mock_context(page, cookies: list[dict] | None = None):
|
||||
"""Build a mock BrowserContext."""
|
||||
context = AsyncMock()
|
||||
context.new_page = AsyncMock(return_value=page)
|
||||
context.cookies = AsyncMock(return_value=cookies or [])
|
||||
context.clear_cookies = AsyncMock()
|
||||
context.close = AsyncMock()
|
||||
return context
|
||||
|
||||
|
||||
def _make_mock_browser(context):
|
||||
"""Build a mock Browser."""
|
||||
browser = AsyncMock()
|
||||
browser.new_context = AsyncMock(return_value=context)
|
||||
browser.close = AsyncMock()
|
||||
return browser
|
||||
|
||||
|
||||
# ── DiscoveredCookie dataclass ──────────────────────────────────────────
|
||||
|
||||
|
||||
class TestDiscoveredCookie:
    """Field defaults and full construction of the DiscoveredCookie record."""

    def test_defaults(self):
        c = DiscoveredCookie(name="_ga", domain="example.com")
        # Only name/domain are required; everything else falls back.
        assert c.storage_type == "cookie"
        assert c.path is None
        assert c.expires is None
        assert c.http_only is None
        assert c.secure is None
        assert c.same_site is None
        assert c.value_length == 0
        assert c.script_source is None
        assert c.page_url == ""

    def test_initiator_chain_defaults_to_empty(self):
        # Must be a fresh list per instance, not a shared mutable default.
        c = DiscoveredCookie(name="_ga", domain="example.com")
        assert c.initiator_chain == []

    def test_with_all_fields(self):
        c = DiscoveredCookie(
            name="_ga",
            domain=".example.com",
            storage_type="cookie",
            path="/",
            expires=1700000000.0,
            http_only=True,
            secure=True,
            same_site="Lax",
            value_length=42,
            script_source="https://cdn.example.com/tracker.js",
            page_url="https://example.com/",
            initiator_chain=["https://example.com/", "https://cdn.example.com/tracker.js"],
        )
        assert c.http_only is True
        assert c.value_length == 42
        assert len(c.initiator_chain) == 2
|
||||
|
||||
|
||||
# ── CrawlResult dataclass ──────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestCrawlResult:
    """Per-page CrawlResult defaults and error capture."""

    def test_defaults(self):
        r = CrawlResult(url="https://example.com/")
        assert r.cookies == []
        assert r.error is None

    def test_with_error(self):
        r = CrawlResult(url="https://example.com/", error="Timeout")
        assert r.error == "Timeout"
|
||||
|
||||
|
||||
# ── SiteCrawlResult ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestSiteCrawlResult:
    """Aggregation behaviour of SiteCrawlResult.unique_cookies."""

    def test_unique_cookies_deduplicates(self):
        # _ga appears on two pages; dedup key is (name, domain, storage_type).
        cookie_a = DiscoveredCookie(name="_ga", domain="example.com", storage_type="cookie")
        cookie_b = DiscoveredCookie(name="_ga", domain="example.com", storage_type="cookie")
        cookie_c = DiscoveredCookie(name="_gid", domain="example.com", storage_type="cookie")

        result = SiteCrawlResult(
            domain="example.com",
            pages=[
                CrawlResult(url="https://example.com/", cookies=[cookie_a, cookie_c]),
                CrawlResult(url="https://example.com/about", cookies=[cookie_b]),
            ],
            total_cookies_found=3,
        )

        unique = result.unique_cookies
        assert len(unique) == 2
        names = {c.name for c in unique}
        assert names == {"_ga", "_gid"}

    def test_unique_cookies_separates_storage_types(self):
        """Same name in cookie vs localStorage should be separate entries."""
        cookie = DiscoveredCookie(name="token", domain="example.com", storage_type="cookie")
        ls = DiscoveredCookie(name="token", domain="example.com", storage_type="local_storage")

        result = SiteCrawlResult(
            domain="example.com",
            pages=[CrawlResult(url="https://example.com/", cookies=[cookie, ls])],
            total_cookies_found=2,
        )

        assert len(result.unique_cookies) == 2

    def test_empty_pages(self):
        result = SiteCrawlResult(domain="example.com")
        assert result.unique_cookies == []
|
||||
|
||||
|
||||
# ── _get_script_initiator ──────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestGetScriptInitiator:
    """_get_script_initiator: find the script URL behind a request chain."""

    @staticmethod
    def _request(url, resource_type, redirected_from=None):
        """Minimal Playwright Request stand-in."""
        req = MagicMock()
        req.url = url
        req.resource_type = resource_type
        req.redirected_from = redirected_from
        return req

    def test_identifies_js_url(self):
        request = self._request("https://cdn.example.com/tracker.js", "script")
        assert _get_script_initiator(request) == "https://cdn.example.com/tracker.js"

    def test_follows_redirect_chain(self):
        # A fetch that was redirected from a script should resolve to the script.
        original = self._request("https://cdn.example.com/analytics.js", "script")
        redirect = self._request("https://example.com/track", "fetch", redirected_from=original)
        assert _get_script_initiator(redirect) == "https://cdn.example.com/analytics.js"

    def test_returns_none_for_non_script(self):
        request = self._request("https://example.com/image.png", "image")
        assert _get_script_initiator(request) is None

    def test_handles_javascript_resource_type(self):
        # Some engines report "javascript" rather than "script".
        request = self._request("https://example.com/bundle", "javascript")
        assert _get_script_initiator(request) == "https://example.com/bundle"

    def test_handles_circular_redirect(self):
        """Should not loop infinitely on circular redirects."""
        req_a = self._request("https://example.com/a", "fetch")
        req_b = self._request("https://example.com/b", "fetch")
        # Tie the two requests into a redirect cycle.
        req_a.redirected_from = req_b
        req_b.redirected_from = req_a

        # Should not hang — returns None since neither is a script.
        assert _get_script_initiator(req_a) is None
|
||||
|
||||
|
||||
# ── _build_initiator_chain ────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestBuildInitiatorChain:
    """_build_initiator_chain: walk the url→parent map, root first."""

    def test_single_url_no_parent(self):
        chain = _build_initiator_chain("https://example.com/script.js", {})
        assert chain == ["https://example.com/script.js"]

    def test_two_level_chain(self):
        imap = {"https://cdn.example.com/tracker.js": "https://example.com/"}
        chain = _build_initiator_chain("https://cdn.example.com/tracker.js", imap)
        # Root page first, then the script that set the cookie.
        assert chain == ["https://example.com/", "https://cdn.example.com/tracker.js"]

    def test_three_level_chain(self):
        imap = {
            "https://cdn.example.com/pixel.js": "https://cdn.example.com/gtm.js",
            "https://cdn.example.com/gtm.js": "https://example.com/",
        }
        chain = _build_initiator_chain("https://cdn.example.com/pixel.js", imap)
        assert chain == [
            "https://example.com/",
            "https://cdn.example.com/gtm.js",
            "https://cdn.example.com/pixel.js",
        ]

    def test_respects_max_depth(self):
        # Chain of 26 links, walked with max_depth=5.
        imap = {
            f"https://example.com/s{i + 1}.js": f"https://example.com/s{i}.js"
            for i in range(25)
        }
        chain = _build_initiator_chain("https://example.com/s25.js", imap, max_depth=5)
        # Capped: the leaf plus at most 5 ancestors.
        assert len(chain) <= 6

    def test_handles_circular_reference(self):
        imap = {
            "https://a.com/a.js": "https://b.com/b.js",
            "https://b.com/b.js": "https://a.com/a.js",
        }
        chain = _build_initiator_chain("https://a.com/a.js", imap)
        # Should not loop — cycle detected via seen set
        assert len(chain) == 2
|
||||
|
||||
|
||||
# ── CookieCrawler._crawl_page ──────────────────────────────────────────
|
||||
|
||||
|
||||
class TestCrawlPage:
    """CookieCrawler._crawl_page against fully mocked browser plumbing."""

    @pytest.mark.asyncio(loop_scope="session")
    async def test_discovers_browser_cookies(self):
        cdp_cookies = [
            {
                "name": "_ga",
                "domain": ".example.com",
                "path": "/",
                "expires": 1700000000,
                "httpOnly": False,
                "secure": True,
                "sameSite": "Lax",
                "value": "GA1.2.12345",
            }
        ]

        page = _make_mock_page()
        context = _make_mock_context(page, cookies=cdp_cookies)
        browser = _make_mock_browser(context)

        result = await CookieCrawler()._crawl_page(browser, "https://example.com/")

        assert len(result.cookies) == 1
        found = result.cookies[0]
        assert found.name == "_ga"
        assert found.domain == ".example.com"
        assert found.storage_type == "cookie"
        assert found.secure is True
        # value_length is derived from the raw value, which is not stored.
        assert found.value_length == len("GA1.2.12345")
        assert result.error is None

    @pytest.mark.asyncio(loop_scope="session")
    async def test_discovers_local_storage(self):
        page = _make_mock_page(ls_items=[{"name": "theme", "valueLength": 4}])
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)

        result = await CookieCrawler()._crawl_page(browser, "https://example.com/")

        ls_cookies = [c for c in result.cookies if c.storage_type == "local_storage"]
        assert len(ls_cookies) == 1
        assert ls_cookies[0].name == "theme"
        assert ls_cookies[0].value_length == 4
        # localStorage entries are attributed to the page's host.
        assert ls_cookies[0].domain == "example.com"

    @pytest.mark.asyncio(loop_scope="session")
    async def test_discovers_session_storage(self):
        page = _make_mock_page(ss_items=[{"name": "session_id", "valueLength": 36}])
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)

        result = await CookieCrawler()._crawl_page(browser, "https://example.com/")

        ss_cookies = [c for c in result.cookies if c.storage_type == "session_storage"]
        assert len(ss_cookies) == 1
        assert ss_cookies[0].name == "session_id"

    @pytest.mark.asyncio(loop_scope="session")
    async def test_handles_page_error(self):
        # Navigation failures must be captured, not raised.
        page = _make_mock_page()
        page.goto = AsyncMock(side_effect=Exception("Navigation timeout"))
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)

        result = await CookieCrawler()._crawl_page(browser, "https://example.com/")

        assert result.error == "Navigation timeout"

    @pytest.mark.asyncio(loop_scope="session")
    async def test_context_closed_after_crawl(self):
        page = _make_mock_page()
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)

        await CookieCrawler()._crawl_page(browser, "https://example.com/")

        context.close.assert_awaited_once()

    @pytest.mark.asyncio(loop_scope="session")
    async def test_context_closed_on_error(self):
        # Context cleanup must run even when navigation blows up.
        page = _make_mock_page()
        page.goto = AsyncMock(side_effect=Exception("fail"))
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)

        await CookieCrawler()._crawl_page(browser, "https://example.com/")

        context.close.assert_awaited_once()

    @pytest.mark.asyncio(loop_scope="session")
    async def test_custom_user_agent(self):
        page = _make_mock_page()
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)

        await CookieCrawler(user_agent="CMPBot/1.0")._crawl_page(
            browser, "https://example.com/"
        )

        browser.new_context.assert_awaited_once()
        call_kwargs = browser.new_context.call_args[1]
        assert call_kwargs["user_agent"] == "CMPBot/1.0"
|
||||
|
||||
|
||||
# ── CookieCrawler.crawl_site ───────────────────────────────────────────
|
||||
|
||||
|
||||
class TestCrawlSite:
    """CookieCrawler.crawl_site with async_playwright patched out."""

    @staticmethod
    def _wire_playwright(mock_pw, browser):
        """Point the patched async_playwright() context manager at *browser*."""
        pw_instance = AsyncMock()
        pw_instance.chromium.launch = AsyncMock(return_value=browser)
        mock_pw.return_value.__aenter__ = AsyncMock(return_value=pw_instance)
        mock_pw.return_value.__aexit__ = AsyncMock(return_value=False)

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.crawler.async_playwright")
    async def test_crawls_multiple_pages(self, mock_pw):
        cdp_cookies = [{"name": "_ga", "domain": ".example.com", "value": "x"}]

        page = _make_mock_page()
        context = _make_mock_context(page, cookies=cdp_cookies)
        browser = _make_mock_browser(context)
        self._wire_playwright(mock_pw, browser)

        crawler = CookieCrawler()
        result = await crawler.crawl_site(["https://example.com/", "https://example.com/about"])

        # Domain is derived from the first URL; both pages were visited.
        assert result.domain == "example.com"
        assert len(result.pages) == 2
        assert result.total_cookies_found >= 2

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.crawler.async_playwright")
    async def test_respects_max_pages(self, mock_pw):
        page = _make_mock_page()
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)
        self._wire_playwright(mock_pw, browser)

        urls = [f"https://example.com/page{i}" for i in range(10)]
        result = await CookieCrawler().crawl_site(urls, max_pages=3)

        assert len(result.pages) == 3

    @pytest.mark.asyncio(loop_scope="session")
    async def test_empty_urls(self):
        # No URLs → empty result without ever launching a browser.
        result = await CookieCrawler().crawl_site([])

        assert result.domain == ""
        assert result.pages == []

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.crawler.async_playwright")
    async def test_browser_closed_after_crawl(self, mock_pw):
        page = _make_mock_page()
        context = _make_mock_context(page)
        browser = _make_mock_browser(context)
        self._wire_playwright(mock_pw, browser)

        await CookieCrawler().crawl_site(["https://example.com/"])

        browser.close.assert_awaited_once()
|
||||
100
apps/scanner/tests/test_crawler_proxy.py
Normal file
100
apps/scanner/tests/test_crawler_proxy.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""Tests for crawler proxy configuration.
|
||||
|
||||
Mocks Playwright to avoid requiring an actual browser installation.
|
||||
"""
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from src.crawler import CookieCrawler, ProxyConfig
|
||||
|
||||
|
||||
class TestProxyConfig:
    """Tests for ProxyConfig dataclass."""

    def test_proxy_config_creation(self) -> None:
        # Credentials are optional and default to None.
        proxy = ProxyConfig(server="http://proxy.example.com:8080")
        assert proxy.server == "http://proxy.example.com:8080"
        assert proxy.username is None
        assert proxy.password is None

    def test_proxy_config_with_auth(self) -> None:
        proxy = ProxyConfig(
            server="http://proxy.example.com:8080",
            username="user",
            password="pass",
        )
        assert proxy.username == "user"
        assert proxy.password == "pass"
|
||||
|
||||
|
||||
class TestCookieCrawlerProxy:
    """Tests for CookieCrawler proxy support."""

    @staticmethod
    def _patched_playwright():
        """Return (context_manager, mock_pw) mimicking async_playwright()."""
        mock_browser = AsyncMock()
        mock_browser.close = AsyncMock()

        mock_pw = MagicMock()
        mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)

        cm = AsyncMock()
        cm.__aenter__ = AsyncMock(return_value=mock_pw)
        cm.__aexit__ = AsyncMock(return_value=False)
        return cm, mock_pw

    def test_crawler_without_proxy(self) -> None:
        assert CookieCrawler(headless=True)._proxy is None

    def test_crawler_with_proxy(self) -> None:
        proxy = ProxyConfig(server="http://proxy.example.com:8080")
        crawler = CookieCrawler(headless=True, proxy=proxy)
        assert crawler._proxy is not None
        assert crawler._proxy.server == "http://proxy.example.com:8080"

    def test_crawler_with_socks5_proxy(self) -> None:
        # SOCKS5 URLs are accepted verbatim.
        proxy = ProxyConfig(server="socks5://proxy.example.com:1080")
        crawler = CookieCrawler(headless=True, proxy=proxy)
        assert crawler._proxy.server == "socks5://proxy.example.com:1080"

    @pytest.mark.asyncio
    async def test_crawl_passes_proxy_to_browser(self) -> None:
        """Verify that proxy config is passed to Playwright launch."""
        proxy = ProxyConfig(
            server="http://proxy.example.com:8080",
            username="user",
            password="pass",
        )
        crawler = CookieCrawler(headless=True, proxy=proxy)

        cm, mock_pw = self._patched_playwright()
        with patch("src.crawler.async_playwright", return_value=cm):
            await crawler.crawl_site(["https://example.com/"], max_pages=1)

        # Launch must receive the full proxy dict, credentials included.
        mock_pw.chromium.launch.assert_called_once()
        call_kwargs = mock_pw.chromium.launch.call_args[1]
        assert "proxy" in call_kwargs
        assert call_kwargs["proxy"]["server"] == "http://proxy.example.com:8080"
        assert call_kwargs["proxy"]["username"] == "user"
        assert call_kwargs["proxy"]["password"] == "pass"

    @pytest.mark.asyncio
    async def test_crawl_without_proxy_omits_proxy_kwarg(self) -> None:
        """Verify that no proxy is passed when none is configured."""
        crawler = CookieCrawler(headless=True)

        cm, mock_pw = self._patched_playwright()
        with patch("src.crawler.async_playwright", return_value=cm):
            await crawler.crawl_site(["https://example.com/"], max_pages=1)

        call_kwargs = mock_pw.chromium.launch.call_args[1]
        assert "proxy" not in call_kwargs
|
||||
153
apps/scanner/tests/test_dark_pattern_detector.py
Normal file
153
apps/scanner/tests/test_dark_pattern_detector.py
Normal file
@@ -0,0 +1,153 @@
|
||||
"""Tests for dark pattern detection — mocks Playwright."""
|
||||
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from src.dark_pattern_detector import (
|
||||
check_button_prominence,
|
||||
check_cookie_wall,
|
||||
check_pre_ticked_boxes,
|
||||
detect_dark_patterns,
|
||||
)
|
||||
|
||||
|
||||
class TestCheckButtonProminence:
    """Tests for ``check_button_prominence`` using mocked Playwright pages.

    Fix: removed the ``call_count`` counter in
    ``test_missing_reject_button_flagged`` — it was declared, made
    ``nonlocal`` and incremented, but never read (dead code).
    """

    @pytest.mark.asyncio
    async def test_no_accept_button_returns_empty(self) -> None:
        """With no accept button on the page there is nothing to compare."""
        page = AsyncMock()
        page.query_selector_all = AsyncMock(return_value=[])
        issues = await check_button_prominence(page)
        assert issues == []

    @pytest.mark.asyncio
    async def test_missing_reject_button_flagged(self) -> None:
        """A visible accept button with no reject counterpart is flagged."""
        # Accept button visible, reject not found
        accept_el = AsyncMock()
        accept_el.is_visible = AsyncMock(return_value=True)
        accept_el.evaluate = AsyncMock(
            return_value={
                "width": 200,
                "height": 40,
                "area": 8000,
                "backgroundColor": "rgb(37, 99, 235)",
                "color": "rgb(255, 255, 255)",
                "fontSize": 16,
                "fontWeight": "600",
                "padding": "8px 16px",
                "text": "Accept All",
                "visible": True,
            }
        )

        async def _mock_query(selector):
            # Accept-style selectors return the button; every other selector
            # (including the reject ones) finds nothing.
            if "Accept" in selector or "Allow" in selector or "accept" in selector:
                return [accept_el]
            return []

        page = AsyncMock()
        page.query_selector_all = _mock_query

        issues = await check_button_prominence(page)
        assert any(i.pattern == "missing_reject_button" for i in issues)

    @pytest.mark.asyncio
    async def test_unequal_button_size_flagged(self) -> None:
        """An accept button far larger than reject triggers unequal_button_size."""
        accept_el = AsyncMock()
        accept_el.is_visible = AsyncMock(return_value=True)
        accept_el.evaluate = AsyncMock(
            return_value={
                "width": 300,
                "height": 50,
                "area": 15000,
                "fontSize": 18,
                "fontWeight": "700",
                "text": "Accept All",
                "visible": True,
            }
        )

        reject_el = AsyncMock()
        reject_el.is_visible = AsyncMock(return_value=True)
        reject_el.evaluate = AsyncMock(
            return_value={
                "width": 100,
                "height": 30,
                "area": 3000,
                "fontSize": 12,
                "fontWeight": "400",
                "text": "Reject",
                "visible": True,
            }
        )

        async def _mock_query(selector):
            # Route accept selectors to the big button and reject selectors
            # to the small one; everything else finds nothing.
            if "Accept" in selector or "Allow" in selector or "accept" in selector:
                return [accept_el]
            if "Reject" in selector or "Decline" in selector or "reject" in selector:
                return [reject_el]
            return []

        page = AsyncMock()
        page.query_selector_all = _mock_query

        issues = await check_button_prominence(page)
        assert any(i.pattern == "unequal_button_size" for i in issues)
|
||||
|
||||
|
||||
class TestCheckPreTickedBoxes:
    """Tests for ``check_pre_ticked_boxes`` with a mocked page evaluate()."""

    @pytest.mark.asyncio
    async def test_no_pre_ticked_returns_empty(self) -> None:
        """When the page script finds no ticked boxes, no issues are raised."""
        mock_page = AsyncMock()
        mock_page.evaluate = AsyncMock(return_value=[])
        assert await check_pre_ticked_boxes(mock_page) == []

    @pytest.mark.asyncio
    async def test_pre_ticked_non_essential_flagged(self) -> None:
        """Pre-ticked non-essential categories yield one critical issue."""
        ticked_boxes = [
            {"name": "analytics", "label": "Analytics Cookies"},
            {"name": "marketing", "label": "Marketing Cookies"},
        ]
        mock_page = AsyncMock()
        mock_page.evaluate = AsyncMock(return_value=ticked_boxes)

        issues = await check_pre_ticked_boxes(mock_page)

        assert len(issues) == 1
        first = issues[0]
        assert first.pattern == "pre_ticked_checkboxes"
        assert first.severity == "critical"
|
||||
|
||||
|
||||
class TestCheckCookieWall:
    """Tests for ``check_cookie_wall`` with a mocked page evaluate()."""

    @pytest.mark.asyncio
    async def test_no_wall_returns_empty(self) -> None:
        """A page script reporting no wall yields no issues."""
        mock_page = AsyncMock()
        mock_page.evaluate = AsyncMock(return_value=False)
        assert await check_cookie_wall(mock_page) == []

    @pytest.mark.asyncio
    async def test_wall_detected(self) -> None:
        """A detected wall yields exactly one critical cookie_wall issue."""
        mock_page = AsyncMock()
        mock_page.evaluate = AsyncMock(return_value=True)

        issues = await check_cookie_wall(mock_page)

        assert len(issues) == 1
        first = issues[0]
        assert first.pattern == "cookie_wall"
        assert first.severity == "critical"
|
||||
|
||||
|
||||
class TestDetectDarkPatterns:
    """Tests for the top-level ``detect_dark_patterns`` entry point."""

    @pytest.mark.asyncio
    async def test_no_banner_returns_empty(self) -> None:
        """With no consent banner present there is nothing to analyse."""
        mock_page = AsyncMock()
        mock_page.url = "https://example.com/"
        mock_page.query_selector_all = AsyncMock(return_value=[])

        result = await detect_dark_patterns(mock_page)

        assert result.banner_found is False
        assert result.issues == []
|
||||
275
apps/scanner/tests/test_sitemap.py
Normal file
275
apps/scanner/tests/test_sitemap.py
Normal file
@@ -0,0 +1,275 @@
|
||||
"""Tests for sitemap URL discovery — CMP-21."""
|
||||
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from src.sitemap import _fetch_sitemap, _find_sitemap_in_robots, discover_urls
|
||||
|
||||
# ── Helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _make_response(status_code: int = 200, text: str = "") -> httpx.Response:
    """Return a stub ``httpx.Response`` bound to a throwaway GET request."""
    dummy_request = httpx.Request("GET", "http://x")
    return httpx.Response(status_code=status_code, text=text, request=dummy_request)
|
||||
|
||||
|
||||
# Fixture: a plain <urlset> sitemap listing three page URLs.
SITEMAP_XML = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page1</loc></url>
<url><loc>https://example.com/page2</loc></url>
<url><loc>https://example.com/page3</loc></url>
</urlset>
"""

# Fixture: a sitemap index pointing at two child sitemaps (main + blog).
SITEMAP_INDEX_XML = """\
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>https://example.com/sitemap-main.xml</loc></sitemap>
<sitemap><loc>https://example.com/sitemap-blog.xml</loc></sitemap>
</sitemapindex>
"""

# Fixture: the "blog" child sitemap referenced by SITEMAP_INDEX_XML.
CHILD_SITEMAP_XML = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/blog/post1</loc></url>
<url><loc>https://example.com/blog/post2</loc></url>
</urlset>
"""

# Fixture: robots.txt that advertises a custom sitemap location.
ROBOTS_TXT_WITH_SITEMAP = """\
User-agent: *
Disallow: /admin/
Sitemap: https://example.com/custom-sitemap.xml
"""

# Fixture: robots.txt with no Sitemap: directive at all.
ROBOTS_TXT_NO_SITEMAP = """\
User-agent: *
Disallow: /admin/
"""
|
||||
|
||||
|
||||
# ── _fetch_sitemap ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestFetchSitemap:
    """Tests for ``_fetch_sitemap`` against a mocked ``httpx.AsyncClient``."""

    @pytest.mark.asyncio(loop_scope="session")
    async def test_parses_regular_sitemap(self):
        """A plain <urlset> sitemap yields its <loc> values in document order."""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML))

        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)

        assert urls == [
            "https://example.com/page1",
            "https://example.com/page2",
            "https://example.com/page3",
        ]

    @pytest.mark.asyncio(loop_scope="session")
    async def test_respects_max_urls(self):
        """At most ``max_urls`` entries come back from a single sitemap."""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML))

        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 2)

        assert len(urls) == 2

    @pytest.mark.asyncio(loop_scope="session")
    async def test_handles_sitemap_index(self):
        """Sitemap index should recursively fetch child sitemaps."""
        # Map each URL to a canned response so recursive fetches resolve.
        responses = {
            "https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML),
            "https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML),
            "https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML),
        }

        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(side_effect=lambda url: responses[url])

        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)

        # 3 URLs from the main child sitemap + 2 from the blog child sitemap.
        assert len(urls) == 5
        assert "https://example.com/page1" in urls
        assert "https://example.com/blog/post1" in urls

    @pytest.mark.asyncio(loop_scope="session")
    async def test_sitemap_index_respects_max_urls(self):
        """Should stop fetching child sitemaps once max_urls is reached."""
        responses = {
            "https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML),
            "https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML),
            "https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML),
        }

        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(side_effect=lambda url: responses[url])

        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 3)

        assert len(urls) == 3

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_empty_on_404(self):
        """A missing sitemap (HTTP 404) yields an empty URL list."""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(404))

        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)

        assert urls == []

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_empty_on_invalid_xml(self):
        """Unparseable sitemap content yields an empty URL list."""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, "not xml at all"))

        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)

        assert urls == []

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_empty_on_network_error(self):
        """A connection failure yields an empty URL list rather than raising."""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(side_effect=httpx.ConnectError("Connection refused"))

        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)

        assert urls == []

    @pytest.mark.asyncio(loop_scope="session")
    async def test_empty_urlset(self):
        """A well-formed sitemap containing no <url> entries yields nothing."""
        empty_sitemap = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>
"""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, empty_sitemap))

        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)

        assert urls == []
|
||||
|
||||
|
||||
# ── _find_sitemap_in_robots ────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestFindSitemapInRobots:
    """Tests for extracting the ``Sitemap:`` directive from robots.txt."""

    @pytest.mark.asyncio(loop_scope="session")
    async def test_finds_sitemap_directive(self):
        """A robots.txt with a Sitemap: line returns that URL verbatim."""
        mock_client = AsyncMock(spec=httpx.AsyncClient)
        mock_client.get = AsyncMock(
            return_value=_make_response(200, ROBOTS_TXT_WITH_SITEMAP)
        )

        found = await _find_sitemap_in_robots(
            mock_client, "https://example.com/robots.txt"
        )

        assert found == "https://example.com/custom-sitemap.xml"

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_none_when_no_directive(self):
        """A robots.txt lacking a Sitemap: line yields None."""
        mock_client = AsyncMock(spec=httpx.AsyncClient)
        mock_client.get = AsyncMock(
            return_value=_make_response(200, ROBOTS_TXT_NO_SITEMAP)
        )

        found = await _find_sitemap_in_robots(
            mock_client, "https://example.com/robots.txt"
        )

        assert found is None

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_none_on_404(self):
        """A missing robots.txt (HTTP 404) yields None."""
        mock_client = AsyncMock(spec=httpx.AsyncClient)
        mock_client.get = AsyncMock(return_value=_make_response(404))

        found = await _find_sitemap_in_robots(
            mock_client, "https://example.com/robots.txt"
        )

        assert found is None

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_none_on_network_error(self):
        """A connection failure while fetching robots.txt yields None."""
        mock_client = AsyncMock(spec=httpx.AsyncClient)
        mock_client.get = AsyncMock(
            side_effect=httpx.ConnectError("Connection refused")
        )

        found = await _find_sitemap_in_robots(
            mock_client, "https://example.com/robots.txt"
        )

        assert found is None

    @pytest.mark.asyncio(loop_scope="session")
    async def test_case_insensitive_directive(self):
        """The Sitemap: directive is recognised regardless of letter case."""
        mixed_case_robots = "User-agent: *\nsITEMAP: https://example.com/sm.xml\n"
        mock_client = AsyncMock(spec=httpx.AsyncClient)
        mock_client.get = AsyncMock(
            return_value=_make_response(200, mixed_case_robots)
        )

        found = await _find_sitemap_in_robots(
            mock_client, "https://example.com/robots.txt"
        )

        assert found == "https://example.com/sm.xml"
|
||||
|
||||
|
||||
# ── discover_urls ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestDiscoverUrls:
    """``discover_urls`` tries /sitemap.xml, then robots.txt, then defaults."""

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_returns_sitemap_urls(self, mock_robots, mock_sitemap):
        """URLs found via /sitemap.xml are returned without consulting robots."""
        discovered = [
            "https://example.com/page1",
            "https://example.com/page2",
        ]
        mock_sitemap.return_value = list(discovered)

        urls = await discover_urls("example.com")

        assert urls == discovered
        mock_robots.assert_not_called()

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_falls_back_to_robots_txt(self, mock_robots, mock_sitemap):
        """When sitemap.xml returns nothing, robots.txt is consulted next."""
        # First fetch (/sitemap.xml) is empty; the second (robots-advertised
        # sitemap) yields one URL.
        mock_sitemap.side_effect = [[], ["https://example.com/from-robots"]]
        mock_robots.return_value = "https://example.com/alt-sitemap.xml"

        urls = await discover_urls("example.com")

        assert urls == ["https://example.com/from-robots"]

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_falls_back_to_default_paths(self, mock_robots, mock_sitemap):
        """With no sitemap anywhere, well-known default paths are returned."""
        mock_sitemap.return_value = []
        mock_robots.return_value = None

        urls = await discover_urls("example.com")

        for expected in (
            "https://example.com/",
            "https://example.com/privacy",
            "https://example.com/cookie-policy",
        ):
            assert expected in urls

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_respects_max_urls(self, mock_robots, mock_sitemap):
        """A large sitemap is truncated to max_urls entries."""
        mock_sitemap.return_value = [
            f"https://example.com/page{i}" for i in range(100)
        ]

        urls = await discover_urls("example.com", max_urls=5)

        assert len(urls) == 5

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_default_paths_respect_max_urls(self, mock_robots, mock_sitemap):
        """The default-path fallback also honours max_urls."""
        mock_sitemap.return_value = []
        mock_robots.return_value = None

        urls = await discover_urls("example.com", max_urls=3)

        assert len(urls) == 3
|
||||
122
apps/scanner/tests/test_worker.py
Normal file
122
apps/scanner/tests/test_worker.py
Normal file
@@ -0,0 +1,122 @@
|
||||
"""Tests for the scanner HTTP service."""
|
||||
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from src.worker import create_app
|
||||
|
||||
|
||||
@pytest.fixture
def client():
    """Return a TestClient wrapping a freshly built scanner app."""
    return TestClient(create_app())
|
||||
|
||||
|
||||
def test_health_endpoint(client):
    """GET /health reports service liveness with a 200 and an ok status."""
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json() == {"status": "ok"}
|
||||
|
||||
|
||||
@patch("src.sitemap.discover_urls", new_callable=AsyncMock)
|
||||
@patch("src.crawler.CookieCrawler.crawl_site", new_callable=AsyncMock)
|
||||
def test_scan_endpoint_with_domain(mock_crawl, mock_discover, client):
|
||||
"""POST /scan with just a domain discovers URLs and crawls."""
|
||||
from src.crawler import CrawlResult, DiscoveredCookie, SiteCrawlResult
|
||||
|
||||
mock_discover.return_value = ["https://example.com/"]
|
||||
mock_crawl.return_value = SiteCrawlResult(
|
||||
domain="example.com",
|
||||
pages=[
|
||||
CrawlResult(
|
||||
url="https://example.com/",
|
||||
cookies=[
|
||||
DiscoveredCookie(
|
||||
name="_ga",
|
||||
domain=".example.com",
|
||||
storage_type="cookie",
|
||||
page_url="https://example.com/",
|
||||
value_length=30,
|
||||
),
|
||||
DiscoveredCookie(
|
||||
name="session_id",
|
||||
domain="example.com",
|
||||
storage_type="cookie",
|
||||
page_url="https://example.com/",
|
||||
value_length=36,
|
||||
http_only=True,
|
||||
secure=True,
|
||||
),
|
||||
],
|
||||
),
|
||||
],
|
||||
total_cookies_found=2,
|
||||
)
|
||||
|
||||
resp = client.post("/scan", json={"domain": "example.com", "max_pages": 5})
|
||||
assert resp.status_code == 200
|
||||
data = resp.json()
|
||||
|
||||
assert data["domain"] == "example.com"
|
||||
assert data["pages_crawled"] == 1
|
||||
assert data["total_cookies"] == 2
|
||||
assert len(data["cookies"]) == 2
|
||||
assert data["cookies"][0]["name"] == "_ga"
|
||||
assert data["cookies"][1]["name"] == "session_id"
|
||||
assert data["cookies"][1]["secure"] is True
|
||||
|
||||
|
||||
@patch("src.crawler.CookieCrawler.crawl_site", new_callable=AsyncMock)
|
||||
def test_scan_endpoint_with_urls(mock_crawl, client):
|
||||
"""POST /scan with explicit URLs skips URL discovery."""
|
||||
from src.crawler import CrawlResult, SiteCrawlResult
|
||||
|
||||
mock_crawl.return_value = SiteCrawlResult(
|
||||
domain="example.com",
|
||||
pages=[CrawlResult(url="https://example.com/about", cookies=[])],
|
||||
total_cookies_found=0,
|
||||
)
|
||||
|
||||
resp = client.post(
|
||||
"/scan",
|
||||
json={
|
||||
"domain": "example.com",
|
||||
"urls": ["https://example.com/about"],
|
||||
"max_pages": 1,
|
||||
},
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
data = resp.json()
|
||||
assert data["pages_crawled"] == 1
|
||||
assert data["cookies"] == []
|
||||
|
||||
|
||||
@patch("src.sitemap.discover_urls", new_callable=AsyncMock)
|
||||
@patch("src.crawler.CookieCrawler.crawl_site", new_callable=AsyncMock)
|
||||
def test_scan_endpoint_with_errors(mock_crawl, mock_discover, client):
|
||||
"""Scan results include page errors."""
|
||||
from src.crawler import CrawlResult, SiteCrawlResult
|
||||
|
||||
mock_discover.return_value = ["https://example.com/"]
|
||||
mock_crawl.return_value = SiteCrawlResult(
|
||||
domain="example.com",
|
||||
pages=[
|
||||
CrawlResult(url="https://example.com/", cookies=[], error="Timeout"),
|
||||
],
|
||||
total_cookies_found=0,
|
||||
)
|
||||
|
||||
resp = client.post("/scan", json={"domain": "example.com"})
|
||||
assert resp.status_code == 200
|
||||
data = resp.json()
|
||||
assert data["errors"] == ["Timeout"]
|
||||
|
||||
|
||||
def test_scan_request_validation(client):
    """A /scan request missing the required domain is rejected with 422."""
    response = client.post("/scan", json={})
    assert response.status_code == 422
|
||||
Reference in New Issue
Block a user