feat: initial public release
ConsentOS — a privacy-first cookie consent management platform. Self-hosted, source-available alternative to OneTrust, Cookiebot, and CookieYes. Full standards coverage (IAB TCF v2.2, GPP v1, Google Consent Mode v2, GPC, Shopify Customer Privacy API), multi-tenant architecture with role-based access, configuration cascade (system → org → group → site → region), dark-pattern detection in the scanner, and a tamper-evident consent record audit trail. This is the initial public release. Prior development history is retained internally. See README.md for the feature list, architecture overview, and quick-start instructions. Licensed under the Elastic Licence 2.0 — self-host freely; do not resell as a managed service.
This commit is contained in:
275
apps/scanner/tests/test_sitemap.py
Normal file
275
apps/scanner/tests/test_sitemap.py
Normal file
@@ -0,0 +1,275 @@
|
||||
"""Tests for sitemap URL discovery — CMP-21."""
|
||||
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from src.sitemap import _fetch_sitemap, _find_sitemap_in_robots, discover_urls
|
||||
|
||||
# ── Helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _make_response(status_code: int = 200, text: str = "") -> httpx.Response:
|
||||
"""Build a fake httpx.Response."""
|
||||
return httpx.Response(
|
||||
status_code=status_code, text=text, request=httpx.Request("GET", "http://x")
|
||||
)
|
||||
|
||||
|
||||
SITEMAP_XML = """\
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
<url><loc>https://example.com/page1</loc></url>
|
||||
<url><loc>https://example.com/page2</loc></url>
|
||||
<url><loc>https://example.com/page3</loc></url>
|
||||
</urlset>
|
||||
"""
|
||||
|
||||
SITEMAP_INDEX_XML = """\
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
<sitemap><loc>https://example.com/sitemap-main.xml</loc></sitemap>
|
||||
<sitemap><loc>https://example.com/sitemap-blog.xml</loc></sitemap>
|
||||
</sitemapindex>
|
||||
"""
|
||||
|
||||
CHILD_SITEMAP_XML = """\
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
<url><loc>https://example.com/blog/post1</loc></url>
|
||||
<url><loc>https://example.com/blog/post2</loc></url>
|
||||
</urlset>
|
||||
"""
|
||||
|
||||
ROBOTS_TXT_WITH_SITEMAP = """\
|
||||
User-agent: *
|
||||
Disallow: /admin/
|
||||
Sitemap: https://example.com/custom-sitemap.xml
|
||||
"""
|
||||
|
||||
ROBOTS_TXT_NO_SITEMAP = """\
|
||||
User-agent: *
|
||||
Disallow: /admin/
|
||||
"""
|
||||
|
||||
|
||||
# ── _fetch_sitemap ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestFetchSitemap:
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_parses_regular_sitemap(self):
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML))
|
||||
|
||||
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
||||
|
||||
assert urls == [
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
"https://example.com/page3",
|
||||
]
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_respects_max_urls(self):
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML))
|
||||
|
||||
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 2)
|
||||
|
||||
assert len(urls) == 2
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_handles_sitemap_index(self):
|
||||
"""Sitemap index should recursively fetch child sitemaps."""
|
||||
responses = {
|
||||
"https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML),
|
||||
"https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML),
|
||||
"https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML),
|
||||
}
|
||||
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(side_effect=lambda url: responses[url])
|
||||
|
||||
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
||||
|
||||
assert len(urls) == 5
|
||||
assert "https://example.com/page1" in urls
|
||||
assert "https://example.com/blog/post1" in urls
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_sitemap_index_respects_max_urls(self):
|
||||
"""Should stop fetching child sitemaps once max_urls is reached."""
|
||||
responses = {
|
||||
"https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML),
|
||||
"https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML),
|
||||
"https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML),
|
||||
}
|
||||
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(side_effect=lambda url: responses[url])
|
||||
|
||||
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 3)
|
||||
|
||||
assert len(urls) == 3
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_returns_empty_on_404(self):
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(return_value=_make_response(404))
|
||||
|
||||
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
||||
|
||||
assert urls == []
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_returns_empty_on_invalid_xml(self):
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(return_value=_make_response(200, "not xml at all"))
|
||||
|
||||
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
||||
|
||||
assert urls == []
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_returns_empty_on_network_error(self):
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(side_effect=httpx.ConnectError("Connection refused"))
|
||||
|
||||
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
||||
|
||||
assert urls == []
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_empty_urlset(self):
|
||||
empty_sitemap = """\
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
</urlset>
|
||||
"""
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(return_value=_make_response(200, empty_sitemap))
|
||||
|
||||
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
||||
|
||||
assert urls == []
|
||||
|
||||
|
||||
# ── _find_sitemap_in_robots ────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestFindSitemapInRobots:
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_finds_sitemap_directive(self):
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(return_value=_make_response(200, ROBOTS_TXT_WITH_SITEMAP))
|
||||
|
||||
url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt")
|
||||
|
||||
assert url == "https://example.com/custom-sitemap.xml"
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_returns_none_when_no_directive(self):
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(return_value=_make_response(200, ROBOTS_TXT_NO_SITEMAP))
|
||||
|
||||
url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt")
|
||||
|
||||
assert url is None
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_returns_none_on_404(self):
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(return_value=_make_response(404))
|
||||
|
||||
url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt")
|
||||
|
||||
assert url is None
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_returns_none_on_network_error(self):
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(side_effect=httpx.ConnectError("Connection refused"))
|
||||
|
||||
url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt")
|
||||
|
||||
assert url is None
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_case_insensitive_directive(self):
|
||||
robots = "User-agent: *\nsITEMAP: https://example.com/sm.xml\n"
|
||||
client = AsyncMock(spec=httpx.AsyncClient)
|
||||
client.get = AsyncMock(return_value=_make_response(200, robots))
|
||||
|
||||
url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt")
|
||||
|
||||
assert url == "https://example.com/sm.xml"
|
||||
|
||||
|
||||
# ── discover_urls ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestDiscoverUrls:
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
@patch("src.sitemap._fetch_sitemap")
|
||||
@patch("src.sitemap._find_sitemap_in_robots")
|
||||
async def test_returns_sitemap_urls(self, mock_robots, mock_sitemap):
|
||||
"""Should return URLs from /sitemap.xml when available."""
|
||||
mock_sitemap.return_value = [
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
]
|
||||
|
||||
urls = await discover_urls("example.com")
|
||||
|
||||
assert urls == ["https://example.com/page1", "https://example.com/page2"]
|
||||
mock_robots.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
@patch("src.sitemap._fetch_sitemap")
|
||||
@patch("src.sitemap._find_sitemap_in_robots")
|
||||
async def test_falls_back_to_robots_txt(self, mock_robots, mock_sitemap):
|
||||
"""When sitemap.xml returns nothing, should try robots.txt."""
|
||||
mock_sitemap.side_effect = [[], ["https://example.com/from-robots"]]
|
||||
mock_robots.return_value = "https://example.com/alt-sitemap.xml"
|
||||
|
||||
urls = await discover_urls("example.com")
|
||||
|
||||
assert urls == ["https://example.com/from-robots"]
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
@patch("src.sitemap._fetch_sitemap")
|
||||
@patch("src.sitemap._find_sitemap_in_robots")
|
||||
async def test_falls_back_to_default_paths(self, mock_robots, mock_sitemap):
|
||||
"""When no sitemap exists, should return default paths."""
|
||||
mock_sitemap.return_value = []
|
||||
mock_robots.return_value = None
|
||||
|
||||
urls = await discover_urls("example.com")
|
||||
|
||||
assert "https://example.com/" in urls
|
||||
assert "https://example.com/privacy" in urls
|
||||
assert "https://example.com/cookie-policy" in urls
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
@patch("src.sitemap._fetch_sitemap")
|
||||
@patch("src.sitemap._find_sitemap_in_robots")
|
||||
async def test_respects_max_urls(self, mock_robots, mock_sitemap):
|
||||
many_urls = [f"https://example.com/page{i}" for i in range(100)]
|
||||
mock_sitemap.return_value = many_urls
|
||||
|
||||
urls = await discover_urls("example.com", max_urls=5)
|
||||
|
||||
assert len(urls) == 5
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
@patch("src.sitemap._fetch_sitemap")
|
||||
@patch("src.sitemap._find_sitemap_in_robots")
|
||||
async def test_default_paths_respect_max_urls(self, mock_robots, mock_sitemap):
|
||||
mock_sitemap.return_value = []
|
||||
mock_robots.return_value = None
|
||||
|
||||
urls = await discover_urls("example.com", max_urls=3)
|
||||
|
||||
assert len(urls) == 3
|
||||
Reference in New Issue
Block a user