Files
consentos/apps/scanner/tests/test_sitemap.py
James Cottrill fbf26453f2 feat: initial public release
ConsentOS — a privacy-first cookie consent management platform.

Self-hosted, source-available alternative to OneTrust, Cookiebot, and
CookieYes. Full standards coverage (IAB TCF v2.2, GPP v1, Google
Consent Mode v2, GPC, Shopify Customer Privacy API), multi-tenant
architecture with role-based access, configuration cascade
(system → org → group → site → region), dark-pattern detection in
the scanner, and a tamper-evident consent record audit trail.

This is the initial public release. Prior development history is
retained internally.

See README.md for the feature list, architecture overview, and
quick-start instructions. Licensed under the Elastic License 2.0 —
self-host freely; do not resell as a managed service.
2026-04-14 09:18:18 +00:00

276 lines
10 KiB
Python

"""Tests for sitemap URL discovery — CMP-21."""
from unittest.mock import AsyncMock, patch
import httpx
import pytest
from src.sitemap import _fetch_sitemap, _find_sitemap_in_robots, discover_urls
# ── Helpers ─────────────────────────────────────────────────────────────
def _make_response(status_code: int = 200, text: str = "") -> httpx.Response:
    """Return a canned ``httpx.Response`` with the given status code and body.

    The response is bound to a throwaway ``GET http://x`` request.
    """
    dummy_request = httpx.Request("GET", "http://x")
    canned = httpx.Response(status_code=status_code, text=text, request=dummy_request)
    return canned
# A plain <urlset> sitemap listing three page URLs on example.com.
SITEMAP_XML = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page1</loc></url>
<url><loc>https://example.com/page2</loc></url>
<url><loc>https://example.com/page3</loc></url>
</urlset>
"""
# A <sitemapindex> document pointing at two child sitemaps
# (sitemap-main.xml first, then sitemap-blog.xml).
SITEMAP_INDEX_XML = """\
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>https://example.com/sitemap-main.xml</loc></sitemap>
<sitemap><loc>https://example.com/sitemap-blog.xml</loc></sitemap>
</sitemapindex>
"""
# A second <urlset> sitemap listing two blog post URLs.
CHILD_SITEMAP_XML = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/blog/post1</loc></url>
<url><loc>https://example.com/blog/post2</loc></url>
</urlset>
"""
# robots.txt body that includes a "Sitemap:" directive.
ROBOTS_TXT_WITH_SITEMAP = """\
User-agent: *
Disallow: /admin/
Sitemap: https://example.com/custom-sitemap.xml
"""
# robots.txt body with no "Sitemap:" directive at all.
ROBOTS_TXT_NO_SITEMAP = """\
User-agent: *
Disallow: /admin/
"""
# ── _fetch_sitemap ─────────────────────────────────────────────────────
class TestFetchSitemap:
    """Tests for ``_fetch_sitemap`` driven by a mocked ``httpx.AsyncClient``."""

    @staticmethod
    def _routing_client(responses):
        """Build a mocked client whose ``get`` serves *responses* keyed by URL.

        Returns ``(client, fetched)`` where ``fetched`` is a list recording
        every requested URL in call order, so tests can assert on which
        sitemaps were actually fetched. Extra positional or keyword arguments
        passed to ``get`` are accepted and ignored, which keeps the stub
        independent of the exact call shape used by the code under test.
        """
        fetched = []

        def serve(url, *args, **kwargs):
            fetched.append(url)
            return responses[url]

        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(side_effect=serve)
        return client, fetched

    @pytest.mark.asyncio(loop_scope="session")
    async def test_parses_regular_sitemap(self):
        """A plain ``<urlset>`` yields its ``<loc>`` values in document order."""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        assert urls == [
            "https://example.com/page1",
            "https://example.com/page2",
            "https://example.com/page3",
        ]

    @pytest.mark.asyncio(loop_scope="session")
    async def test_respects_max_urls(self):
        """The result is capped at ``max_urls`` entries."""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 2)
        assert len(urls) == 2

    @pytest.mark.asyncio(loop_scope="session")
    async def test_handles_sitemap_index(self):
        """Sitemap index should recursively fetch child sitemaps."""
        client, _ = self._routing_client(
            {
                "https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML),
                "https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML),
                "https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML),
            }
        )
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        assert len(urls) == 5
        assert "https://example.com/page1" in urls
        assert "https://example.com/blog/post1" in urls

    @pytest.mark.asyncio(loop_scope="session")
    async def test_sitemap_index_respects_max_urls(self):
        """Should stop fetching child sitemaps once max_urls is reached."""
        client, fetched = self._routing_client(
            {
                "https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML),
                "https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML),
                "https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML),
            }
        )
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 3)
        assert len(urls) == 3
        # The first child alone supplies three URLs, so the limit is already
        # met and the second child listed in the index must never be requested.
        assert "https://example.com/sitemap-blog.xml" not in fetched

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_empty_on_404(self):
        """A 404 response produces an empty list."""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(404))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        assert urls == []

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_empty_on_invalid_xml(self):
        """A 200 response whose body is not XML produces an empty list."""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, "not xml at all"))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        assert urls == []

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_empty_on_network_error(self):
        """An ``httpx.ConnectError`` raised by the client produces an empty list."""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(side_effect=httpx.ConnectError("Connection refused"))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        assert urls == []

    @pytest.mark.asyncio(loop_scope="session")
    async def test_empty_urlset(self):
        """A well-formed ``<urlset>`` with no entries produces an empty list."""
        empty_sitemap = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>
"""
        client = AsyncMock(spec=httpx.AsyncClient)
        client.get = AsyncMock(return_value=_make_response(200, empty_sitemap))
        urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
        assert urls == []
# ── _find_sitemap_in_robots ────────────────────────────────────────────
class TestFindSitemapInRobots:
    """Checks for ``_find_sitemap_in_robots`` using a stubbed HTTP client."""

    # Every test requests the same robots.txt location.
    ROBOTS_URL = "https://example.com/robots.txt"

    @staticmethod
    def _stub_client(**get_behaviour):
        """Return a mocked ``httpx.AsyncClient`` whose ``get`` is configured
        with the given ``AsyncMock`` keyword arguments (``return_value`` or
        ``side_effect``)."""
        stub = AsyncMock(spec=httpx.AsyncClient)
        stub.get = AsyncMock(**get_behaviour)
        return stub

    @pytest.mark.asyncio(loop_scope="session")
    async def test_finds_sitemap_directive(self):
        """The URL from a ``Sitemap:`` line is returned."""
        stub = self._stub_client(
            return_value=_make_response(200, ROBOTS_TXT_WITH_SITEMAP)
        )
        found = await _find_sitemap_in_robots(stub, self.ROBOTS_URL)
        assert found == "https://example.com/custom-sitemap.xml"

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_none_when_no_directive(self):
        """A robots.txt without a ``Sitemap:`` line gives ``None``."""
        stub = self._stub_client(
            return_value=_make_response(200, ROBOTS_TXT_NO_SITEMAP)
        )
        found = await _find_sitemap_in_robots(stub, self.ROBOTS_URL)
        assert found is None

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_none_on_404(self):
        """A 404 for robots.txt gives ``None``."""
        stub = self._stub_client(return_value=_make_response(404))
        found = await _find_sitemap_in_robots(stub, self.ROBOTS_URL)
        assert found is None

    @pytest.mark.asyncio(loop_scope="session")
    async def test_returns_none_on_network_error(self):
        """An ``httpx.ConnectError`` raised by the client gives ``None``."""
        stub = self._stub_client(
            side_effect=httpx.ConnectError("Connection refused")
        )
        found = await _find_sitemap_in_robots(stub, self.ROBOTS_URL)
        assert found is None

    @pytest.mark.asyncio(loop_scope="session")
    async def test_case_insensitive_directive(self):
        """The directive name is matched regardless of letter case."""
        robots_body = "User-agent: *\nsITEMAP: https://example.com/sm.xml\n"
        stub = self._stub_client(return_value=_make_response(200, robots_body))
        found = await _find_sitemap_in_robots(stub, self.ROBOTS_URL)
        assert found == "https://example.com/sm.xml"
# ── discover_urls ──────────────────────────────────────────────────────
class TestDiscoverUrls:
    """Tests for ``discover_urls`` with both sitemap helpers patched out.

    ``@patch`` decorators are applied bottom-up, so the mock replacing
    ``_find_sitemap_in_robots`` is injected first (``mock_robots``) and the
    mock replacing ``_fetch_sitemap`` second (``mock_sitemap``).
    """

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_returns_sitemap_urls(self, mock_robots, mock_sitemap):
        """Should return URLs from /sitemap.xml when available."""
        mock_sitemap.return_value = [
            "https://example.com/page1",
            "https://example.com/page2",
        ]
        urls = await discover_urls("example.com")
        assert urls == ["https://example.com/page1", "https://example.com/page2"]
        # A successful sitemap fetch must short-circuit the robots.txt lookup.
        mock_robots.assert_not_called()

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_falls_back_to_robots_txt(self, mock_robots, mock_sitemap):
        """When sitemap.xml returns nothing, should try robots.txt."""
        # First fetch yields nothing; the second one serves the robots-derived URLs.
        mock_sitemap.side_effect = [[], ["https://example.com/from-robots"]]
        mock_robots.return_value = "https://example.com/alt-sitemap.xml"
        urls = await discover_urls("example.com")
        assert urls == ["https://example.com/from-robots"]
        # Without this check the test would also pass for an implementation
        # that reaches the second fetch without ever consulting robots.txt.
        mock_robots.assert_awaited()

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_falls_back_to_default_paths(self, mock_robots, mock_sitemap):
        """When no sitemap exists, should return default paths."""
        mock_sitemap.return_value = []
        mock_robots.return_value = None
        urls = await discover_urls("example.com")
        assert "https://example.com/" in urls
        assert "https://example.com/privacy" in urls
        assert "https://example.com/cookie-policy" in urls

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_respects_max_urls(self, mock_robots, mock_sitemap):
        """Sitemap results are capped at ``max_urls``."""
        many_urls = [f"https://example.com/page{i}" for i in range(100)]
        mock_sitemap.return_value = many_urls
        urls = await discover_urls("example.com", max_urls=5)
        assert len(urls) == 5

    @pytest.mark.asyncio(loop_scope="session")
    @patch("src.sitemap._fetch_sitemap")
    @patch("src.sitemap._find_sitemap_in_robots")
    async def test_default_paths_respect_max_urls(self, mock_robots, mock_sitemap):
        """The default-path fallback is capped at ``max_urls`` as well."""
        mock_sitemap.return_value = []
        mock_robots.return_value = None
        urls = await discover_urls("example.com", max_urls=3)
        assert len(urls) == 3