gabriel / musehub public
test_musehub_sitemap.py python
332 lines 11.6 KB
cd448303 Initial extraction of MuseHub from maestro monorepo. Gabriel Cardona <gabriel@tellurstori.com> 7d ago
1 """Tests for the MuseHub sitemap.xml and robots.txt endpoints.
2
Covers acceptance criteria:
- test_sitemap_returns_xml — GET /sitemap.xml returns 200 with XML content-type
- test_sitemap_contains_static_pages — static explore/trending/topics URLs are always present
- test_sitemap_contains_public_repo — a seeded public repo appears in the sitemap
- test_sitemap_excludes_private_repo — private repos do NOT appear in the sitemap
- test_sitemap_contains_user_profile — seeded user profile URL appears in sitemap
- test_sitemap_contains_topic_urls — repo tags generate /topics/{tag} entries
- test_sitemap_contains_release_url — a release URL appears for repos with releases
- test_sitemap_xml_well_formed — sitemap can be parsed as valid XML
- test_sitemap_loc_uses_request_host — loc entries use the base URL from the request
- test_sitemap_no_auth_required — sitemap is accessible without JWT
- test_sitemap_repo_commits_page_included — each public repo's /commits page is listed
- test_sitemap_repo_issues_page_included — each public repo's /issues page is listed
- test_robots_txt_returns_plain_text — GET /robots.txt returns 200 text/plain
- test_robots_txt_allows_musehub_ui — Allow: /musehub/ui/ is present
- test_robots_txt_disallows_settings — settings path is disallowed
- test_robots_txt_disallows_notifications — notifications path is disallowed
- test_robots_txt_disallows_api — /api/ directory is disallowed
- test_robots_txt_contains_sitemap_url — Sitemap: directive points to /sitemap.xml
- test_robots_txt_names_known_agents — known AI bots are named in robots.txt
- test_robots_txt_no_auth_required — endpoint is accessible without JWT
22 """
23 from __future__ import annotations
24
25 import pytest
26 from httpx import AsyncClient
27 from sqlalchemy.ext.asyncio import AsyncSession
28 from xml.etree import ElementTree as ET
29
30 from musehub.db.musehub_models import (
31 MusehubProfile,
32 MusehubRelease,
33 MusehubRepo,
34 )
35
36
37 # ---------------------------------------------------------------------------
38 # Helpers
39 # ---------------------------------------------------------------------------
40
41
async def _make_public_repo(
    db_session: AsyncSession,
    *,
    owner: str = "sitemap-user",
    slug: str = "sitemap-repo",
    tags: list[str] | None = None,
    visibility: str = "public",
) -> MusehubRepo:
    """Persist a ``MusehubRepo`` row for a test and hand back the ORM instance.

    The repo name mirrors ``slug``. ``visibility`` defaults to ``"public"`` but
    can be overridden (e.g. ``"private"``) to seed non-public repos. A missing
    or empty ``tags`` value is stored as an empty list.

    The row is committed and then refreshed from the session before returning.
    """
    # Falsy tags (None or an empty list) collapse to a fresh empty list.
    repo_tags = [] if not tags else tags
    record = MusehubRepo(
        name=slug,
        owner=owner,
        slug=slug,
        visibility=visibility,
        owner_user_id="sitemap-user-id",
        description="test repo for sitemap",
        tags=repo_tags,
    )
    db_session.add(record)
    await db_session.commit()
    await db_session.refresh(record)
    return record
64
65
async def _make_profile(
    db_session: AsyncSession,
    *,
    username: str = "sitemap-user",
    user_id: str = "sitemap-user-id",
) -> MusehubProfile:
    """Persist a ``MusehubProfile`` row for a test and hand back the ORM instance.

    Only ``user_id`` and ``username`` are set explicitly. The row is committed
    and then refreshed from the session before returning.
    """
    record = MusehubProfile(user_id=user_id, username=username)
    db_session.add(record)
    await db_session.commit()
    await db_session.refresh(record)
    return record
81
82
async def _make_release(
    db_session: AsyncSession,
    repo_id: str,
    *,
    tag: str = "v1.0",
) -> MusehubRelease:
    """Persist a ``MusehubRelease`` for ``repo_id`` and hand back the ORM instance.

    The title is derived from ``tag`` (``"Release <tag>"``), the body is empty,
    and the author is fixed to ``"sitemap-user"``. The row is committed and then
    refreshed from the session before returning.
    """
    release_title = f"Release {tag}"
    record = MusehubRelease(
        repo_id=repo_id,
        tag=tag,
        title=release_title,
        body="",
        author="sitemap-user",
    )
    db_session.add(record)
    await db_session.commit()
    await db_session.refresh(record)
    return record
101
102
103 # ---------------------------------------------------------------------------
104 # Sitemap tests
105 # ---------------------------------------------------------------------------
106
107
@pytest.mark.anyio
async def test_sitemap_returns_xml(client: AsyncClient, db_session: AsyncSession) -> None:
    """The sitemap endpoint answers 200 and advertises an XML content-type."""
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    content_type = resp.headers["content-type"]
    assert "xml" in content_type
114
115
@pytest.mark.anyio
async def test_sitemap_contains_static_pages(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """The explore, trending, and topics pages are listed even with no seeded data."""
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    xml_text = resp.text
    # Checked in the same order as the static pages are named in the docstring.
    for static_path in (
        "/musehub/ui/explore",
        "/musehub/ui/trending",
        "/musehub/ui/topics",
    ):
        assert static_path in xml_text
127
128
@pytest.mark.anyio
async def test_sitemap_contains_public_repo(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Seeding a public repo makes its ``/musehub/ui/{owner}/{slug}`` URL show up."""
    await _make_public_repo(db_session, owner="artist", slug="cool-track")
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    assert "/musehub/ui/artist/cool-track" in resp.text
139
140
@pytest.mark.anyio
async def test_sitemap_excludes_private_repo(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Neither the slug nor the owner of a private repo may leak into the sitemap."""
    await _make_public_repo(
        db_session,
        owner="secretuser",
        slug="hidden-project",
        visibility="private",
    )
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    xml_text = resp.text
    # Slug first, then owner: either appearing would expose the private repo.
    for private_token in ("hidden-project", "secretuser"):
        assert private_token not in xml_text
152
153
@pytest.mark.anyio
async def test_sitemap_contains_user_profile(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Seeding a profile makes its ``/musehub/ui/users/{username}`` URL show up."""
    await _make_profile(db_session, username="jazzmaster", user_id="jazzmaster-uid")
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    xml_text = resp.text
    assert "/musehub/ui/users/jazzmaster" in xml_text
163
164
@pytest.mark.anyio
async def test_sitemap_contains_topic_urls(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Each tag on a public repo yields a ``/musehub/ui/topics/{tag}`` entry."""
    await _make_public_repo(db_session, owner="producer", slug="beats", tags=["lo-fi", "jazz"])
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    xml_text = resp.text
    for topic_path in ("/musehub/ui/topics/lo-fi", "/musehub/ui/topics/jazz"):
        assert topic_path in xml_text
176
177
@pytest.mark.anyio
async def test_sitemap_contains_release_url(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """A release seeded on a public repo is listed under ``.../releases/{tag}``."""
    seeded_repo = await _make_public_repo(db_session, owner="bandname", slug="debut-album")
    await _make_release(db_session, seeded_repo.repo_id, tag="v1.0")
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    xml_text = resp.text
    assert "/musehub/ui/bandname/debut-album/releases/v1.0" in xml_text
188
189
@pytest.mark.anyio
async def test_sitemap_xml_well_formed(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """The sitemap body parses as XML and its root element is a ``urlset``."""
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    # ET.fromstring raises on malformed XML, which fails the test by itself.
    document = ET.fromstring(resp.content)
    # endswith() tolerates a namespace-qualified tag such as "{ns}urlset".
    root_tag = document.tag
    assert root_tag.endswith("urlset")
200
201
@pytest.mark.anyio
async def test_sitemap_loc_uses_request_host(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Every ``<loc>`` in the sitemap is built from the incoming request's base URL.

    The test client uses ``base_url="http://test"``, so each ``<loc>`` value
    must start with ``http://test``. The document is parsed and all ``<loc>``
    elements are inspected, so a single hard-coded or foreign host among
    otherwise correct entries fails the test.
    """
    await _make_public_repo(db_session, owner="testowner", slug="testrepo")
    response = await client.get("/sitemap.xml")
    assert response.status_code == 200

    root = ET.fromstring(response.content)
    # Compare on the local tag name so the check does not depend on which
    # XML namespace (if any) the sitemap declares.
    locs = [
        (element.text or "").strip()
        for element in root.iter()
        if element.tag.rsplit("}", 1)[-1] == "loc"
    ]
    assert locs, "sitemap contains no <loc> entries"

    off_host = [loc for loc in locs if not loc.startswith("http://test")]
    assert not off_host, f"<loc> entries not using the request host: {off_host}"
213
214
@pytest.mark.anyio
async def test_sitemap_no_auth_required(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """The sitemap is served without a JWT, since crawlers do not authenticate."""
    resp = await client.get("/sitemap.xml")
    status = resp.status_code
    # The explicit 401 check keeps an auth regression obvious in the failure output.
    assert status != 401
    assert status == 200
223
224
@pytest.mark.anyio
async def test_sitemap_repo_commits_page_included(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """A public repo's ``/commits`` sub-page is listed in the sitemap."""
    await _make_public_repo(db_session, owner="composer", slug="symphony-no1")
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    xml_text = resp.text
    assert "/musehub/ui/composer/symphony-no1/commits" in xml_text
234
235
@pytest.mark.anyio
async def test_sitemap_repo_issues_page_included(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """A public repo's ``/issues`` sub-page is listed in the sitemap."""
    await _make_public_repo(db_session, owner="composer", slug="symphony-no2")
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    xml_text = resp.text
    assert "/musehub/ui/composer/symphony-no2/issues" in xml_text
245
246
247 # ---------------------------------------------------------------------------
248 # Robots.txt tests
249 # ---------------------------------------------------------------------------
250
251
@pytest.mark.anyio
async def test_robots_txt_returns_plain_text(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """The robots.txt endpoint answers 200 and advertises ``text/plain``."""
    resp = await client.get("/robots.txt")
    assert resp.status_code == 200
    content_type = resp.headers["content-type"]
    assert "text/plain" in content_type
260
261
@pytest.mark.anyio
async def test_robots_txt_allows_musehub_ui(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """robots.txt contains an ``Allow: /musehub/ui/`` rule."""
    resp = await client.get("/robots.txt")
    assert resp.status_code == 200
    robots_text = resp.text
    assert "Allow: /musehub/ui/" in robots_text
270
271
@pytest.mark.anyio
async def test_robots_txt_disallows_settings(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """robots.txt disallows settings pages so private user config is not indexed."""
    resp = await client.get("/robots.txt")
    assert resp.status_code == 200
    robots_text = resp.text
    assert "Disallow: /musehub/ui/*/settings" in robots_text
280
281
@pytest.mark.anyio
async def test_robots_txt_disallows_notifications(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """robots.txt disallows the notifications pages (user-private inbox content)."""
    resp = await client.get("/robots.txt")
    assert resp.status_code == 200
    robots_text = resp.text
    assert "Disallow: /musehub/ui/notifications" in robots_text
290
291
@pytest.mark.anyio
async def test_robots_txt_disallows_api(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """robots.txt disallows ``/api/``; crawlers should rely on the sitemap instead."""
    resp = await client.get("/robots.txt")
    assert resp.status_code == 200
    robots_text = resp.text
    assert "Disallow: /api/" in robots_text
300
301
@pytest.mark.anyio
async def test_robots_txt_contains_sitemap_url(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """A ``Sitemap:`` directive is present and its target is ``/sitemap.xml``.

    The directive line itself is inspected, so the test cannot be satisfied by
    ``Sitemap:`` and ``sitemap.xml`` appearing in unrelated parts of the file
    (for example a comment plus a Disallow rule).
    """
    response = await client.get("/robots.txt")
    assert response.status_code == 200

    # Collect the value of every line that is a Sitemap directive.
    sitemap_targets = [
        line.strip().partition(":")[2].strip()
        for line in response.text.splitlines()
        if line.strip().startswith("Sitemap:")
    ]
    assert sitemap_targets, "robots.txt has no Sitemap: directive"
    assert any(target.endswith("/sitemap.xml") for target in sitemap_targets), (
        f"no Sitemap: directive points to /sitemap.xml: {sitemap_targets}"
    )
311
312
@pytest.mark.anyio
async def test_robots_txt_names_known_agents(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """robots.txt mentions each known discovery bot (GPTBot, ClaudeBot, etc.) by name.

    NOTE(review): only the presence of each bot name is checked; the rule
    attached to it (e.g. an explicit Allow) is not verified here.
    """
    resp = await client.get("/robots.txt")
    assert resp.status_code == 200
    robots_text = resp.text
    expected_agents = ("GPTBot", "ClaudeBot", "Googlebot", "CursorBot")
    for agent_name in expected_agents:
        assert agent_name in robots_text
323
324
@pytest.mark.anyio
async def test_robots_txt_no_auth_required(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """robots.txt is served to a request that carries no authentication."""
    resp = await client.get("/robots.txt")
    status = resp.status_code
    # The explicit 401 check keeps an auth regression obvious in the failure output.
    assert status != 401
    assert status == 200