"""Tests for sitemap URL discovery — CMP-21.""" from unittest.mock import AsyncMock, patch import httpx import pytest from src.sitemap import _fetch_sitemap, _find_sitemap_in_robots, discover_urls # ── Helpers ───────────────────────────────────────────────────────────── def _make_response(status_code: int = 200, text: str = "") -> httpx.Response: """Build a fake httpx.Response.""" return httpx.Response( status_code=status_code, text=text, request=httpx.Request("GET", "http://x") ) SITEMAP_XML = """\ https://example.com/page1 https://example.com/page2 https://example.com/page3 """ SITEMAP_INDEX_XML = """\ https://example.com/sitemap-main.xml https://example.com/sitemap-blog.xml """ CHILD_SITEMAP_XML = """\ https://example.com/blog/post1 https://example.com/blog/post2 """ ROBOTS_TXT_WITH_SITEMAP = """\ User-agent: * Disallow: /admin/ Sitemap: https://example.com/custom-sitemap.xml """ ROBOTS_TXT_NO_SITEMAP = """\ User-agent: * Disallow: /admin/ """ # ── _fetch_sitemap ───────────────────────────────────────────────────── class TestFetchSitemap: @pytest.mark.asyncio(loop_scope="session") async def test_parses_regular_sitemap(self): client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML)) urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50) assert urls == [ "https://example.com/page1", "https://example.com/page2", "https://example.com/page3", ] @pytest.mark.asyncio(loop_scope="session") async def test_respects_max_urls(self): client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(return_value=_make_response(200, SITEMAP_XML)) urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 2) assert len(urls) == 2 @pytest.mark.asyncio(loop_scope="session") async def test_handles_sitemap_index(self): """Sitemap index should recursively fetch child sitemaps.""" responses = { "https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML), "https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML), "https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML), } client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(side_effect=lambda url: responses[url]) urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50) assert len(urls) == 5 assert "https://example.com/page1" in urls assert "https://example.com/blog/post1" in urls @pytest.mark.asyncio(loop_scope="session") async def test_sitemap_index_respects_max_urls(self): """Should stop fetching child sitemaps once max_urls is reached.""" responses = { "https://example.com/sitemap.xml": _make_response(200, SITEMAP_INDEX_XML), "https://example.com/sitemap-main.xml": _make_response(200, SITEMAP_XML), "https://example.com/sitemap-blog.xml": _make_response(200, CHILD_SITEMAP_XML), } client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(side_effect=lambda url: responses[url]) urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 3) assert len(urls) == 3 @pytest.mark.asyncio(loop_scope="session") async def test_returns_empty_on_404(self): client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(return_value=_make_response(404)) urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50) assert urls == [] @pytest.mark.asyncio(loop_scope="session") async def test_returns_empty_on_invalid_xml(self): client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(return_value=_make_response(200, "not xml at all")) urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50) assert urls == [] @pytest.mark.asyncio(loop_scope="session") async def test_returns_empty_on_network_error(self): client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(side_effect=httpx.ConnectError("Connection refused")) urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50) assert urls == [] @pytest.mark.asyncio(loop_scope="session") async def test_empty_urlset(self): empty_sitemap = """\ """ client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(return_value=_make_response(200, empty_sitemap)) urls = await _fetch_sitemap(client, "https://example.com/sitemap.xml", 50) assert urls == [] # ── _find_sitemap_in_robots ──────────────────────────────────────────── class TestFindSitemapInRobots: @pytest.mark.asyncio(loop_scope="session") async def test_finds_sitemap_directive(self): client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(return_value=_make_response(200, ROBOTS_TXT_WITH_SITEMAP)) url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt") assert url == "https://example.com/custom-sitemap.xml" @pytest.mark.asyncio(loop_scope="session") async def test_returns_none_when_no_directive(self): client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(return_value=_make_response(200, ROBOTS_TXT_NO_SITEMAP)) url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt") assert url is None @pytest.mark.asyncio(loop_scope="session") async def test_returns_none_on_404(self): client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(return_value=_make_response(404)) url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt") assert url is None @pytest.mark.asyncio(loop_scope="session") async def test_returns_none_on_network_error(self): client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(side_effect=httpx.ConnectError("Connection refused")) url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt") assert url is None @pytest.mark.asyncio(loop_scope="session") async def test_case_insensitive_directive(self): robots = "User-agent: *\nsITEMAP: https://example.com/sm.xml\n" client = AsyncMock(spec=httpx.AsyncClient) client.get = AsyncMock(return_value=_make_response(200, robots)) url = await _find_sitemap_in_robots(client, "https://example.com/robots.txt") assert url == "https://example.com/sm.xml" # ── discover_urls ────────────────────────────────────────────────────── class TestDiscoverUrls: @pytest.mark.asyncio(loop_scope="session") @patch("src.sitemap._fetch_sitemap") @patch("src.sitemap._find_sitemap_in_robots") async def test_returns_sitemap_urls(self, mock_robots, mock_sitemap): """Should return URLs from /sitemap.xml when available.""" mock_sitemap.return_value = [ "https://example.com/page1", "https://example.com/page2", ] urls = await discover_urls("example.com") assert urls == ["https://example.com/page1", "https://example.com/page2"] mock_robots.assert_not_called() @pytest.mark.asyncio(loop_scope="session") @patch("src.sitemap._fetch_sitemap") @patch("src.sitemap._find_sitemap_in_robots") async def test_falls_back_to_robots_txt(self, mock_robots, mock_sitemap): """When sitemap.xml returns nothing, should try robots.txt.""" mock_sitemap.side_effect = [[], ["https://example.com/from-robots"]] mock_robots.return_value = "https://example.com/alt-sitemap.xml" urls = await discover_urls("example.com") assert urls == ["https://example.com/from-robots"] @pytest.mark.asyncio(loop_scope="session") @patch("src.sitemap._fetch_sitemap") @patch("src.sitemap._find_sitemap_in_robots") async def test_falls_back_to_default_paths(self, mock_robots, mock_sitemap): """When no sitemap exists, should return default paths.""" mock_sitemap.return_value = [] mock_robots.return_value = None urls = await discover_urls("example.com") assert "https://example.com/" in urls assert "https://example.com/privacy" in urls assert "https://example.com/cookie-policy" in urls @pytest.mark.asyncio(loop_scope="session") @patch("src.sitemap._fetch_sitemap") @patch("src.sitemap._find_sitemap_in_robots") async def test_respects_max_urls(self, mock_robots, mock_sitemap): many_urls = [f"https://example.com/page{i}" for i in range(100)] mock_sitemap.return_value = many_urls urls = await discover_urls("example.com", max_urls=5) assert len(urls) == 5 @pytest.mark.asyncio(loop_scope="session") @patch("src.sitemap._fetch_sitemap") @patch("src.sitemap._find_sitemap_in_robots") async def test_default_paths_respect_max_urls(self, mock_robots, mock_sitemap): mock_sitemap.return_value = [] mock_robots.return_value = None urls = await discover_urls("example.com", max_urls=3) assert len(urls) == 3