ConsentOS — a privacy-first cookie consent management platform. Self-hosted, source-available alternative to OneTrust, Cookiebot, and CookieYes. Full standards coverage (IAB TCF v2.2, GPP v1, Google Consent Mode v2, GPC, Shopify Customer Privacy API), multi-tenant architecture with role-based access, configuration cascade (system → org → group → site → region), dark-pattern detection in the scanner, and a tamper-evident consent record audit trail. This is the initial public release. Prior development history is retained internally. See README.md for the feature list, architecture overview, and quick-start instructions. Licensed under the Elastic Licence 2.0 — self-host freely; do not resell as a managed service.
276 lines
10 KiB
Python
276 lines
10 KiB
Python
"""Tests for sitemap URL discovery — CMP-21."""
|
|
|
|
from unittest.mock import AsyncMock, patch
|
|
|
|
import httpx
|
|
import pytest
|
|
|
|
from src.sitemap import _fetch_sitemap, _find_sitemap_in_robots, discover_urls
|
|
|
|
# ── Helpers ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _make_response(status_code: int = 200, text: str = "") -> httpx.Response:
|
|
"""Build a fake httpx.Response."""
|
|
return httpx.Response(
|
|
status_code=status_code, text=text, request=httpx.Request("GET", "http://x")
|
|
)
|
|
|
|
|
|
SITEMAP_XML = """\
|
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
<url><loc>https://example.com/page1</loc></url>
|
|
<url><loc>https://example.com/page2</loc></url>
|
|
<url><loc>https://example.com/page3</loc></url>
|
|
</urlset>
|
|
"""
|
|
|
|
SITEMAP_INDEX_XML = """\
|
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
<sitemap><loc>https://example.com/sitemap-main.xml</loc></sitemap>
|
|
<sitemap><loc>https://example.com/sitemap-blog.xml</loc></sitemap>
|
|
</sitemapindex>
|
|
"""
|
|
|
|
CHILD_SITEMAP_XML = """\
|
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
<url><loc>https://example.com/blog/post1</loc></url>
|
|
<url><loc>https://example.com/blog/post2</loc></url>
|
|
</urlset>
|
|
"""
|
|
|
|
ROBOTS_TXT_WITH_SITEMAP = """\
|
|
User-agent: *
|
|
Disallow: /admin/
|
|
Sitemap: https://example.com/custom-sitemap.xml
|
|
"""
|
|
|
|
ROBOTS_TXT_NO_SITEMAP = """\
|
|
User-agent: *
|
|
Disallow: /admin/
|
|
"""
|
|
|
|
|
|
# ── _fetch_sitemap ─────────────────────────────────────────────────────
|
|
|
|
|
|
class TestFetchSitemap:
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_parses_regular_sitemap(self):
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML))
|
|
|
|
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
|
|
|
assert urls == [
|
|
"https://example.com/page1",
|
|
"https://example.com/page2",
|
|
"https://example.com/page3",
|
|
]
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_respects_max_urls(self):
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML))
|
|
|
|
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 2)
|
|
|
|
assert len(urls) == 2
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_handles_sitemap_index(self):
|
|
"""Sitemap index should recursively fetch child sitemaps."""
|
|
responses = {
|
|
"https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML),
|
|
"https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML),
|
|
"https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML),
|
|
}
|
|
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(side_effect=lambda url: responses[url])
|
|
|
|
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
|
|
|
assert len(urls) == 5
|
|
assert "https://example.com/page1" in urls
|
|
assert "https://example.com/blog/post1" in urls
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_sitemap_index_respects_max_urls(self):
|
|
"""Should stop fetching child sitemaps once max_urls is reached."""
|
|
responses = {
|
|
"https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML),
|
|
"https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML),
|
|
"https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML),
|
|
}
|
|
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(side_effect=lambda url: responses[url])
|
|
|
|
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 3)
|
|
|
|
assert len(urls) == 3
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_returns_empty_on_404(self):
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(return_value=_make_response(404))
|
|
|
|
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
|
|
|
assert urls == []
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_returns_empty_on_invalid_xml(self):
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(return_value=_make_response(200, "not xml at all"))
|
|
|
|
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
|
|
|
assert urls == []
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_returns_empty_on_network_error(self):
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(side_effect=httpx.ConnectError("Connection refused"))
|
|
|
|
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
|
|
|
assert urls == []
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_empty_urlset(self):
|
|
empty_sitemap = """\
|
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
</urlset>
|
|
"""
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(return_value=_make_response(200, empty_sitemap))
|
|
|
|
urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50)
|
|
|
|
assert urls == []
|
|
|
|
|
|
# ── _find_sitemap_in_robots ────────────────────────────────────────────
|
|
|
|
|
|
class TestFindSitemapInRobots:
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_finds_sitemap_directive(self):
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(return_value=_make_response(200, ROBOTS_TXT_WITH_SITEMAP))
|
|
|
|
url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt")
|
|
|
|
assert url == "https://example.com/custom-sitemap.xml"
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_returns_none_when_no_directive(self):
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(return_value=_make_response(200, ROBOTS_TXT_NO_SITEMAP))
|
|
|
|
url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt")
|
|
|
|
assert url is None
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_returns_none_on_404(self):
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(return_value=_make_response(404))
|
|
|
|
url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt")
|
|
|
|
assert url is None
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_returns_none_on_network_error(self):
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(side_effect=httpx.ConnectError("Connection refused"))
|
|
|
|
url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt")
|
|
|
|
assert url is None
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
async def test_case_insensitive_directive(self):
|
|
robots = "User-agent: *\nsITEMAP: https://example.com/sm.xml\n"
|
|
client = AsyncMock(spec=httpx.AsyncClient)
|
|
client.get = AsyncMock(return_value=_make_response(200, robots))
|
|
|
|
url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt")
|
|
|
|
assert url == "https://example.com/sm.xml"
|
|
|
|
|
|
# ── discover_urls ──────────────────────────────────────────────────────
|
|
|
|
|
|
class TestDiscoverUrls:
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
@patch("src.sitemap._fetch_sitemap")
|
|
@patch("src.sitemap._find_sitemap_in_robots")
|
|
async def test_returns_sitemap_urls(self, mock_robots, mock_sitemap):
|
|
"""Should return URLs from /sitemap.xml when available."""
|
|
mock_sitemap.return_value = [
|
|
"https://example.com/page1",
|
|
"https://example.com/page2",
|
|
]
|
|
|
|
urls = await discover_urls("example.com")
|
|
|
|
assert urls == ["https://example.com/page1", "https://example.com/page2"]
|
|
mock_robots.assert_not_called()
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
@patch("src.sitemap._fetch_sitemap")
|
|
@patch("src.sitemap._find_sitemap_in_robots")
|
|
async def test_falls_back_to_robots_txt(self, mock_robots, mock_sitemap):
|
|
"""When sitemap.xml returns nothing, should try robots.txt."""
|
|
mock_sitemap.side_effect = [[], ["https://example.com/from-robots"]]
|
|
mock_robots.return_value = "https://example.com/alt-sitemap.xml"
|
|
|
|
urls = await discover_urls("example.com")
|
|
|
|
assert urls == ["https://example.com/from-robots"]
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
@patch("src.sitemap._fetch_sitemap")
|
|
@patch("src.sitemap._find_sitemap_in_robots")
|
|
async def test_falls_back_to_default_paths(self, mock_robots, mock_sitemap):
|
|
"""When no sitemap exists, should return default paths."""
|
|
mock_sitemap.return_value = []
|
|
mock_robots.return_value = None
|
|
|
|
urls = await discover_urls("example.com")
|
|
|
|
assert "https://example.com/" in urls
|
|
assert "https://example.com/privacy" in urls
|
|
assert "https://example.com/cookie-policy" in urls
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
@patch("src.sitemap._fetch_sitemap")
|
|
@patch("src.sitemap._find_sitemap_in_robots")
|
|
async def test_respects_max_urls(self, mock_robots, mock_sitemap):
|
|
many_urls = [f"https://example.com/page{i}" for i in range(100)]
|
|
mock_sitemap.return_value = many_urls
|
|
|
|
urls = await discover_urls("example.com", max_urls=5)
|
|
|
|
assert len(urls) == 5
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
|
|
@patch("src.sitemap._fetch_sitemap")
|
|
@patch("src.sitemap._find_sitemap_in_robots")
|
|
async def test_default_paths_respect_max_urls(self, mock_robots, mock_sitemap):
|
|
mock_sitemap.return_value = []
|
|
mock_robots.return_value = None
|
|
|
|
urls = await discover_urls("example.com", max_urls=3)
|
|
|
|
assert len(urls) == 3
|