gabriel / muse public
test_markdown_adapter.py python
724 lines 27.4 KB
9d49af7a feat: semantic TOML and Markdown adapters for the code domain plugin Gabriel Cardona <gabriel@tellurstori.com> 1d ago
1 """Tests for the rewritten MarkdownAdapter.
2
3 Coverage:
4 - Extension routing: only .md / .rst / .txt are accepted.
5 - Section symbols: flat headings, hierarchical qualified names, level encoding.
6 - Content-ID correctness: full section bytes hashed, not just heading text.
7 - Body-hash / signature split: retitle detection, level-change detection.
8 - Code block symbols: language tag, no-language fallback, content hash.
9 - GFM table symbols: header signature, data-row body_hash, schema changes.
10 - Inline markup stripping: bold, italic, inline-code, links in headings.
11 - Deduplication: identical sibling headings get @L{lineno} suffix.
12 - Depth limit: sections beyond _MAX_DEPTH are silently dropped.
13 - Edge cases: empty file, no headings, setext headings (unsupported → skip).
14 - Real-world shape: README-shaped document exercises all three emitters.
15 - _plain_heading unit tests: images dropped, markup stripped, truncation.
16 """
17
18 from __future__ import annotations
19
20 import pytest
21 from muse.plugins.code.ast_parser import (
22 MarkdownAdapter,
23 SymbolRecord,
24 SymbolTree,
25 _plain_heading,
26 )
27
28
29 # ---------------------------------------------------------------------------
30 # Helpers
31 # ---------------------------------------------------------------------------
32
def _parse(source: str, path: str = "README.md") -> SymbolTree:
    """Run ``MarkdownAdapter.parse_symbols`` over *source* and return its result.

    *source* is encoded to bytes before being handed to the adapter together
    with *path*.  If the adapter's ``_parser`` attribute is ``None`` the
    calling test is skipped rather than failed.
    """
    md_adapter = MarkdownAdapter()
    parser_missing = md_adapter._parser is None
    if parser_missing:
        pytest.skip("tree-sitter-markdown not available")
    encoded = source.encode()
    return md_adapter.parse_symbols(encoded, path)
38
39
40 # ---------------------------------------------------------------------------
41 # _plain_heading unit tests
42 # ---------------------------------------------------------------------------
43
class TestPlainHeading:
    """Unit tests for ``_plain_heading``: heading text in, plain text out."""

    def test_plain_text_unchanged(self) -> None:
        """Text without markup passes through untouched."""
        assert _plain_heading("Hello World") == "Hello World"

    def test_bold_stripped(self) -> None:
        """``**bold**`` markers are removed, the text is kept."""
        assert _plain_heading("**Bold** heading") == "Bold heading"

    def test_italic_star_stripped(self) -> None:
        """``*italic*`` markers are removed."""
        assert _plain_heading("*italic* text") == "italic text"

    def test_bold_italic_combined(self) -> None:
        """Triple-star emphasis is removed in one go."""
        assert _plain_heading("***bold italic***") == "bold italic"

    def test_italic_underscore_stripped(self) -> None:
        """``_italic_`` markers are removed."""
        assert _plain_heading("_italic_") == "italic"

    def test_bold_underscore_stripped(self) -> None:
        """``__bold__`` markers are removed."""
        assert _plain_heading("__bold__") == "bold"

    def test_inline_code_stripped(self) -> None:
        """Single backticks are removed, the code text is kept."""
        assert _plain_heading("`code` block") == "code block"

    def test_triple_backtick_stripped(self) -> None:
        """Runs of three backticks are removed as well."""
        assert _plain_heading("```code```") == "code"

    def test_link_keeps_text(self) -> None:
        """Inline links collapse to their link text."""
        assert _plain_heading("[link text](https://example.com)") == "link text"

    def test_reference_link_keeps_text(self) -> None:
        """Reference-style links collapse to their link text."""
        assert _plain_heading("[link text][ref]") == "link text"

    def test_image_dropped_entirely(self) -> None:
        """Inline images vanish, alt text included."""
        assert _plain_heading("![alt text](img.png) caption") == "caption"

    def test_reference_image_dropped(self) -> None:
        """Reference-style images vanish, alt text included."""
        assert _plain_heading("![alt][ref] caption") == "caption"

    def test_html_entity_amp(self) -> None:
        """``&amp;`` is decoded to ``&``."""
        assert _plain_heading("foo &amp; bar") == "foo & bar"

    def test_html_entity_lt_gt(self) -> None:
        """``&lt;`` / ``&gt;`` are decoded to ``<`` / ``>``."""
        assert _plain_heading("a &lt; b &gt; c") == "a < b > c"

    def test_html_entity_quot(self) -> None:
        """``&quot;`` is decoded to a double quote."""
        assert _plain_heading("say &quot;hi&quot;") == 'say "hi"'

    def test_html_entity_apos(self) -> None:
        """The numeric entity ``&#39;`` is decoded to an apostrophe."""
        assert _plain_heading("it&#39;s") == "it's"

    def test_whitespace_collapsed(self) -> None:
        """Leading/trailing whitespace is trimmed and inner runs collapse."""
        # The input must contain *runs* of spaces; with single spaces only,
        # this test would pass on a plain ``strip()`` and never exercise
        # the collapsing behaviour its name promises.
        assert _plain_heading("  too   many   spaces  ") == "too many spaces"

    def test_truncation_at_120_chars(self) -> None:
        """A 200-character heading comes back exactly 120 characters long."""
        long = "A" * 200
        result = _plain_heading(long)
        assert len(result) == 120

    def test_empty_string(self) -> None:
        """The empty string maps to the empty string."""
        assert _plain_heading("") == ""

    def test_mixed_markup(self) -> None:
        """Bold and inline code in the same heading are both stripped."""
        # Realistic heading: "**API** `Reference` Guide"
        result = _plain_heading("**API** `Reference` Guide")
        assert result == "API Reference Guide"
108
109
110 # ---------------------------------------------------------------------------
111 # Extension routing
112 # ---------------------------------------------------------------------------
113
class TestExtensionRouting:
    """Checks membership in ``MarkdownAdapter.supported_extensions()``."""

    def test_md_supported(self) -> None:
        """``.md`` is listed."""
        extensions = MarkdownAdapter().supported_extensions()
        assert ".md" in extensions

    def test_rst_supported(self) -> None:
        """``.rst`` is listed."""
        extensions = MarkdownAdapter().supported_extensions()
        assert ".rst" in extensions

    def test_txt_supported(self) -> None:
        """``.txt`` is listed."""
        extensions = MarkdownAdapter().supported_extensions()
        assert ".txt" in extensions

    def test_py_not_supported(self) -> None:
        """``.py`` is not listed."""
        extensions = MarkdownAdapter().supported_extensions()
        assert ".py" not in extensions

    def test_html_not_supported(self) -> None:
        """``.html`` is not listed."""
        extensions = MarkdownAdapter().supported_extensions()
        assert ".html" not in extensions
134
135
136 # ---------------------------------------------------------------------------
137 # Section symbols: flat headings
138 # ---------------------------------------------------------------------------
139
class TestFlatSections:
    """Single-heading documents: one section symbol per heading."""

    def test_h1_emitted(self) -> None:
        """A level-1 heading yields an address containing its text."""
        symbols = _parse("# Hello\n\nContent.\n")
        addresses = list(symbols)
        assert any("Hello" in addr for addr in addresses)

    def test_h1_kind_is_section(self) -> None:
        """The record for a heading has ``kind == "section"``."""
        symbols = _parse("# Hello\n\nContent.\n")
        record = next(rec for addr, rec in symbols.items() if "Hello" in addr)
        assert record["kind"] == "section"

    def test_h2_emitted(self) -> None:
        """A lone level-2 heading is emitted too."""
        symbols = _parse("## Setup\n\nDo the thing.\n")
        addresses = list(symbols)
        assert any("Setup" in addr for addr in addresses)

    def test_h3_emitted(self) -> None:
        """A lone level-3 heading is emitted too."""
        symbols = _parse("### Detail\n\nMore detail.\n")
        addresses = list(symbols)
        assert any("Detail" in addr for addr in addresses)

    def test_address_contains_file_path(self) -> None:
        """Addresses are prefixed with ``<path>::``."""
        symbols = _parse("# Hello\n", "docs/guide.md")
        assert any(addr.startswith("docs/guide.md::") for addr in symbols)

    def test_lineno_is_one_based(self) -> None:
        """A heading on the first line reports ``lineno == 1``."""
        symbols = _parse("# Hello\n\nContent.\n")
        record = next(rec for addr, rec in symbols.items() if "Hello" in addr)
        assert record["lineno"] == 1

    def test_end_lineno_greater_than_lineno(self) -> None:
        """``end_lineno`` is never smaller than ``lineno``."""
        symbols = _parse("# Hello\n\nSome content.\n")
        record = next(rec for addr, rec in symbols.items() if "Hello" in addr)
        first_line, last_line = record["lineno"], record["end_lineno"]
        assert last_line >= first_line

    def test_name_is_plain_text(self) -> None:
        """Inline markup in the heading does not reach ``name``."""
        symbols = _parse("# **Bold** Heading\n\nContent.\n")
        record = next(
            rec for addr, rec in symbols.items() if "Bold Heading" in addr
        )
        assert record["name"] == "Bold Heading"
179
180
181 # ---------------------------------------------------------------------------
182 # Section symbols: hierarchy
183 # ---------------------------------------------------------------------------
184
class TestSectionHierarchy:
    """Nested headings: dotted qualified names and parent/child hashing."""

    def test_h2_under_h1_has_qualified_name(self) -> None:
        """An H2 below an H1 is addressed as ``Parent.Child``."""
        symbols = _parse("# Parent\n\n## Child\n\nText.\n")
        assert any("Parent.Child" in addr for addr in symbols)

    def test_h3_under_h2_under_h1(self) -> None:
        """Three nested levels produce a three-part dotted name."""
        symbols = _parse("# A\n\n## B\n\n### C\n\nText.\n")
        assert any("A.B.C" in addr for addr in symbols)

    def test_sibling_h2s_are_distinct(self) -> None:
        """Both sibling H2 sections show up in the result."""
        symbols = _parse("# Root\n\n## Alpha\n\nFoo.\n\n## Beta\n\nBar.\n")
        for heading in ("Alpha", "Beta"):
            assert any(heading in addr for addr in symbols)

    def test_h2_address_does_not_bleed_into_sibling(self) -> None:
        """A sibling H2 is not treated as a child of the preceding H2."""
        symbols = _parse("# Root\n\n## A\n\nFoo.\n\n## B\n\nBar.\n")
        # B follows A at the same level, so no address may read "A.B".
        assert all("A.B" not in addr for addr in symbols)

    def test_parent_section_includes_child_in_content_id(self) -> None:
        """The parent's ``content_id`` differs once a child section is added."""
        nested = _parse("# Parent\n\n## Child\n\nText.\n")
        flat = _parse("# Parent\n\nText.\n")
        nested_parent = next(
            rec for addr, rec in nested.items() if addr.endswith("::Parent")
        )
        flat_parent = next(
            rec for addr, rec in flat.items() if addr.endswith("::Parent")
        )
        assert nested_parent["content_id"] != flat_parent["content_id"]

    def test_parallel_h2s_in_separate_h1_sections_dont_collide(self) -> None:
        """Same-named H2s under different H1s get two separate addresses."""
        symbols = _parse(
            "# Intro\n\n## Overview\n\nX.\n\n# Usage\n\n## Overview\n\nY.\n"
        )
        matching = [addr for addr in symbols if "Overview" in addr]
        assert len(matching) == 2
        first, second = matching[0], matching[1]
        assert first != second
225
226
227 # ---------------------------------------------------------------------------
228 # Content-ID correctness — the core bug fix
229 # ---------------------------------------------------------------------------
230
class TestContentIDCorrectness:
    """How ``content_id``, ``body_hash``, ``signature_id`` and ``metadata_id``
    react to edits of a section's body, title and heading level."""

    @staticmethod
    def _record(source: str, needle: str) -> SymbolRecord:
        """Parse *source* and return the first record whose address contains *needle*."""
        tree = _parse(source)
        address = next(addr for addr in tree if needle in addr)
        return tree[address]

    def test_changing_body_changes_content_id(self) -> None:
        """Rewriting the paragraph under a heading changes ``content_id``."""
        before = self._record("# Intro\n\nFirst paragraph.\n", "Intro")
        after = self._record(
            "# Intro\n\nFirst paragraph changed entirely.\n", "Intro"
        )
        assert before["content_id"] != after["content_id"]

    def test_same_content_produces_same_content_id(self) -> None:
        """Parsing identical text twice gives identical ``content_id`` values."""
        text = "# Hello\n\nSame content.\n"
        first_run = _parse(text)
        second_run = _parse(text)
        address = next(addr for addr in first_run if "Hello" in addr)
        assert (
            first_run[address]["content_id"] == second_run[address]["content_id"]
        )

    def test_adding_paragraph_changes_content_id(self) -> None:
        """Appending a second paragraph changes ``content_id``."""
        before = self._record("# Section\n\nParagraph one.\n", "Section")
        after = self._record(
            "# Section\n\nParagraph one.\n\nParagraph two.\n", "Section"
        )
        assert before["content_id"] != after["content_id"]

    def test_heading_retitle_changes_content_id(self) -> None:
        """Changing only the heading text changes ``content_id``."""
        # The two records live at different addresses because the title is
        # part of the address; each is looked up by its own title.
        old = self._record("# Old Title\n\nSame body.\n", "Old Title")
        new = self._record("# New Title\n\nSame body.\n", "New Title")
        assert old["content_id"] != new["content_id"]

    def test_retitle_with_same_body_has_same_body_hash(self) -> None:
        """Retitle detection: body_hash stable, signature_id changes."""
        old = self._record(
            "# Old Title\n\nIdentical body content.\n", "Old Title"
        )
        new = self._record(
            "# New Title\n\nIdentical body content.\n", "New Title"
        )
        # Unchanged text below the heading keeps body_hash equal ...
        assert old["body_hash"] == new["body_hash"]
        # ... while the new heading text shows up in signature_id.
        assert old["signature_id"] != new["signature_id"]

    def test_level_change_changes_metadata_id(self) -> None:
        """Promoting a heading level is visible in metadata_id, not body_hash."""
        as_h2 = self._record("## Section\n\nBody.\n", "Section")
        as_h1 = self._record("# Section\n\nBody.\n", "Section")
        assert as_h2["metadata_id"] != as_h1["metadata_id"]
        # The body text is untouched, so body_hash is expected to match.
        assert as_h2["body_hash"] == as_h1["body_hash"]

    def test_level_change_changes_signature_id(self) -> None:
        """Promoting a heading level also changes ``signature_id``."""
        as_h2 = self._record("## Section\n\nBody.\n", "Section")
        as_h1 = self._record("# Section\n\nBody.\n", "Section")
        assert as_h2["signature_id"] != as_h1["signature_id"]
301
302
303 # ---------------------------------------------------------------------------
304 # Fenced code blocks
305 # ---------------------------------------------------------------------------
306
class TestCodeBlockSymbols:
    """Fenced code blocks: address shape, kind and hashing."""

    def test_python_block_emitted(self) -> None:
        """A ``python`` fence is addressed with ``code[python]``."""
        symbols = _parse("# Section\n\n```python\nprint('hello')\n```\n")
        assert any("code[python]" in addr for addr in symbols)

    def test_code_block_kind_is_variable(self) -> None:
        """Code-block records carry ``kind == "variable"``."""
        symbols = _parse("# Section\n\n```python\nprint('hello')\n```\n")
        record = next(
            rec for addr, rec in symbols.items() if "code[python]" in addr
        )
        assert record["kind"] == "variable"

    def test_no_language_block_emitted(self) -> None:
        """A fence without an info string is addressed with ``code@L``."""
        symbols = _parse("# Section\n\n```\nplain text\n```\n")
        assert any("code@L" in addr for addr in symbols)

    def test_no_language_not_in_symbol_name(self) -> None:
        """A fence without a language never yields an empty ``code[]`` tag."""
        symbols = _parse("# Section\n\n```\nplain text\n```\n")
        # Expected form is code@L..., never code[]@L...
        assert all("code[]" not in addr for addr in symbols)

    def test_code_block_scoped_to_section(self) -> None:
        """The code block's address mentions the enclosing section."""
        symbols = _parse("# Intro\n\n```python\nx = 1\n```\n")
        assert any(
            "Intro" in addr and "code[python]" in addr for addr in symbols
        )

    def test_code_content_change_changes_content_id(self) -> None:
        """Editing the code inside the fence changes ``content_id``."""
        before = _parse("# S\n\n```python\nx = 1\n```\n")
        after = _parse("# S\n\n```python\nx = 2\n```\n")
        addr_before = next(addr for addr in before if "code[python]" in addr)
        addr_after = next(addr for addr in after if "code[python]" in addr)
        assert (
            before[addr_before]["content_id"] != after[addr_after]["content_id"]
        )

    def test_lang_change_changes_signature_id(self) -> None:
        """Switching the fence language changes ``signature_id``."""
        py_tree = _parse("# S\n\n```python\nx = 1\n```\n")
        js_tree = _parse("# S\n\n```javascript\nx = 1\n```\n")
        py_addr = next(addr for addr in py_tree if "code[python]" in addr)
        js_addr = next(addr for addr in js_tree if "code[javascript]" in addr)
        assert (
            py_tree[py_addr]["signature_id"] != js_tree[js_addr]["signature_id"]
        )

    def test_lang_tag_is_lowercased(self) -> None:
        """A ``Python`` info string appears as ``python`` in the address."""
        symbols = _parse("# S\n\n```Python\npass\n```\n")
        assert any("code[python]" in addr for addr in symbols)

    def test_multiple_code_blocks_are_distinct(self) -> None:
        """Two python fences in one section give two addresses."""
        document = (
            "# Section\n\n"
            "```python\nblock_one = 1\n```\n\n"
            "```python\nblock_two = 2\n```\n"
        )
        symbols = _parse(document)
        fences = [addr for addr in symbols if "code[python]" in addr]
        assert len(fences) == 2
        first, second = fences[0], fences[1]
        assert first != second

    def test_code_block_lineno_populated(self) -> None:
        """The code block's ``lineno`` is a positive number."""
        symbols = _parse("# Section\n\n```python\npass\n```\n")
        record = next(
            rec for addr, rec in symbols.items() if "code[python]" in addr
        )
        assert record["lineno"] > 0
376
377
378 # ---------------------------------------------------------------------------
379 # GFM pipe tables
380 # ---------------------------------------------------------------------------
381
class TestTableSymbols:
    """GFM pipe tables: address shape, kind and header/data hashing."""

    # Two-column table with two data rows under a single H1.
    _TABLE_SRC = (
        "# Section\n\n"
        "| Name | Value |\n"
        "| ---- | ----- |\n"
        "| foo | 1 |\n"
        "| bar | 2 |\n"
    )

    @staticmethod
    def _table_record(source: str) -> SymbolRecord:
        """Parse *source* and return the first record addressed with ``table@L``."""
        tree = _parse(source)
        address = next(addr for addr in tree if "table@L" in addr)
        return tree[address]

    def test_table_emitted(self) -> None:
        """A pipe table yields an address containing ``table@L``."""
        symbols = _parse(self._TABLE_SRC)
        assert any("table@L" in addr for addr in symbols)

    def test_table_kind_is_section(self) -> None:
        """Table records carry ``kind == "section"``."""
        symbols = _parse(self._TABLE_SRC)
        record = next(rec for addr, rec in symbols.items() if "table@L" in addr)
        assert record["kind"] == "section"

    def test_table_scoped_to_section(self) -> None:
        """The table's address mentions the enclosing section."""
        symbols = _parse(self._TABLE_SRC)
        assert any("Section" in addr and "table@L" in addr for addr in symbols)

    def test_adding_data_row_changes_content_id(self) -> None:
        """An extra data row changes the table's ``content_id``."""
        one_row = "# S\n\n| A | B |\n| - | - |\n| 1 | 2 |\n"
        two_rows = "# S\n\n| A | B |\n| - | - |\n| 1 | 2 |\n| 3 | 4 |\n"
        before = self._table_record(one_row)
        after = self._table_record(two_rows)
        assert before["content_id"] != after["content_id"]

    def test_adding_data_row_changes_body_hash(self) -> None:
        """An extra data row changes the table's ``body_hash``."""
        one_row = "# S\n\n| A | B |\n| - | - |\n| 1 | 2 |\n"
        two_rows = "# S\n\n| A | B |\n| - | - |\n| 1 | 2 |\n| 3 | 4 |\n"
        before = self._table_record(one_row)
        after = self._table_record(two_rows)
        assert before["body_hash"] != after["body_hash"]

    def test_column_rename_changes_signature_id(self) -> None:
        """Renaming a header cell changes ``signature_id``."""
        original = "# S\n\n| Name | Value |\n| ---- | ----- |\n| x | 1 |\n"
        renamed = "# S\n\n| Label | Value |\n| ----- | ----- |\n| x | 1 |\n"
        before = self._table_record(original)
        after = self._table_record(renamed)
        assert before["signature_id"] != after["signature_id"]

    def test_column_rename_does_not_change_body_hash(self) -> None:
        """Renaming a column header should change signature_id but not body_hash."""
        original = "# S\n\n| Name | Value |\n| ---- | ----- |\n| x | 1 |\n"
        renamed = "# S\n\n| Label | Value |\n| ------ | ----- |\n| x | 1 |\n"
        before = self._table_record(original)
        after = self._table_record(renamed)
        # Only the header row differs; the data rows are identical.
        assert before["body_hash"] == after["body_hash"]

    def test_table_lineno_populated(self) -> None:
        """The table's ``lineno`` is a positive number."""
        symbols = _parse(self._TABLE_SRC)
        record = next(rec for addr, rec in symbols.items() if "table@L" in addr)
        assert record["lineno"] > 0
452
453
454 # ---------------------------------------------------------------------------
455 # Inline markup stripping — address stability
456 # ---------------------------------------------------------------------------
457
class TestInlineMarkupStripping:
    """Inline markup in headings must not leak into symbol addresses."""

    def test_bold_heading_address_matches_plain(self) -> None:
        """``# **Setup**`` and ``# Setup`` produce the same ``name``."""
        src_bold = "# **Setup**\n\nContent.\n"
        src_plain = "# Setup\n\nContent.\n"
        syms_bold = _parse(src_bold)
        syms_plain = _parse(src_plain)
        # Both should produce a key containing "Setup" (not **Setup**).
        assert any("Setup" in k for k in syms_bold)
        assert any("Setup" in k for k in syms_plain)
        # The name recorded for both should be identical.
        name_bold = next(v for k, v in syms_bold.items() if "Setup" in k)["name"]
        name_plain = next(v for k, v in syms_plain.items() if "Setup" in k)["name"]
        assert name_bold == name_plain

    def test_inline_code_heading_stripped(self) -> None:
        """Backticks around inline code are removed from the address."""
        src = "# `muse init` Command\n\nContent.\n"
        syms = _parse(src)
        assert any("muse init Command" in k for k in syms)

    def test_link_heading_keeps_text(self) -> None:
        """A heading that is a link is addressed by its link text."""
        src = "# [API Reference](https://example.com/api)\n\nContent.\n"
        syms = _parse(src)
        assert any("API Reference" in k for k in syms)

    def test_image_in_heading_dropped(self) -> None:
        """An image in a heading contributes neither its URL nor its alt text."""
        src = "# ![logo](logo.png) Intro\n\nContent.\n"
        syms = _parse(src)
        # The logo image should be gone; "Intro" should remain.
        assert any("Intro" in k for k in syms)
        # Checking "logo" (not just "logo.png") also catches leaked alt text;
        # the default path "README.md" does not contain it.
        assert not any("logo" in k for k in syms)
488
489
490 # ---------------------------------------------------------------------------
491 # Deduplication
492 # ---------------------------------------------------------------------------
493
class TestDeduplication:
    """Identical sibling headings are disambiguated with an ``@L`` suffix."""

    def test_two_identical_h2s_get_unique_addresses(self) -> None:
        """Two same-named H2 siblings are both kept, under separate keys."""
        src = (
            "# Root\n\n"
            "## Examples\n\nFirst.\n\n"
            "## Examples\n\nSecond.\n"
        )
        syms = _parse(src)
        examples_keys = [k for k in syms if "Examples" in k]
        assert len(examples_keys) == 2
        assert examples_keys[0] != examples_keys[1]

    def test_deduplicated_key_contains_lineno(self) -> None:
        """At least one of the duplicate keys carries the ``@L`` marker."""
        src = (
            "# Root\n\n"
            "## Examples\n\nFirst.\n\n"
            "## Examples\n\nSecond.\n"
        )
        syms = _parse(src)
        examples_keys = [k for k in syms if "Examples" in k]
        # One of the two keys must have @L appended.
        assert any("@L" in k for k in examples_keys)

    def test_identical_headings_in_different_parents_not_deduped(self) -> None:
        """Same-named H2s under different H1s keep their plain dotted names."""
        src = (
            "# Alpha\n\n## Notes\n\nFoo.\n\n"
            "# Beta\n\n## Notes\n\nBar.\n"
        )
        syms = _parse(src)
        notes_keys = [k for k in syms if "Notes" in k]
        assert len(notes_keys) == 2
        # Should be Alpha.Notes and Beta.Notes — no @L suffix needed.
        assert any("Alpha.Notes" in k for k in notes_keys)
        assert any("Beta.Notes" in k for k in notes_keys)
        # The parents already disambiguate the two, so assert that the
        # dedup suffix really is absent rather than only commenting on it.
        assert not any("@L" in k for k in notes_keys)
528
529
530 # ---------------------------------------------------------------------------
531 # Depth limit
532 # ---------------------------------------------------------------------------
533
class TestDepthLimit:
    """Deep or oddly shaped heading structures must parse without raising."""

    def test_deep_nesting_does_not_crash(self) -> None:
        """A document with 1..20 leading ``#`` characters parses to a dict."""
        # Each entry is "<i hashes> Level<i>" followed by a paragraph.
        # NOTE(review): CommonMark ATX headings allow at most six '#', so the
        # lines with 7+ hashes are presumably parsed as paragraphs rather
        # than deeper sections — confirm whether this reaches _MAX_DEPTH.
        levels = ["#" * i + f" Level{i}\n\nText.\n\n" for i in range(1, 21)]
        src = "".join(levels)
        # Should not raise; may return fewer symbols than levels.
        syms = _parse(src)
        assert isinstance(syms, dict)

    def test_symbols_within_limit_are_extracted(self) -> None:
        """All three nesting levels are present under their qualified names."""
        # Only 3 levels — all should be extracted.
        src = "# A\n\n## A B\n\n### A B C\n\nText.\n"
        syms = _parse(src)
        # A bare `"A" in k` check is vacuous: the default path "README.md"
        # itself contains "A", so any key at all would satisfy it.  Anchor
        # on the "::" separator and the dotted qualified name instead.
        assert any(k.endswith("::A") for k in syms)
        assert any(k.endswith("::A.A B") for k in syms)
        assert any(k.endswith("::A.A B.A B C") for k in syms)
548
549
550 # ---------------------------------------------------------------------------
551 # Edge cases
552 # ---------------------------------------------------------------------------
553
class TestEdgeCases:
    """Degenerate inputs and ``file_content_id`` sanity checks."""

    def test_empty_file_returns_empty(self) -> None:
        """Zero bytes of input produce an empty result."""
        md_adapter = MarkdownAdapter()
        if md_adapter._parser is None:
            pytest.skip("tree-sitter-markdown not available")
        symbols = md_adapter.parse_symbols(b"", "empty.md")
        assert symbols == {}

    def test_no_headings_returns_empty(self) -> None:
        """A lone paragraph produces no symbols."""
        symbols = _parse("Just a paragraph with no headings.\n")
        assert symbols == {}

    def test_only_horizontal_rule_returns_empty(self) -> None:
        """A lone ``---`` line produces no symbols."""
        symbols = _parse("---\n")
        assert symbols == {}

    def test_binary_like_content_does_not_crash(self) -> None:
        """Bytes that are not valid UTF-8 still come back as a dict."""
        md_adapter = MarkdownAdapter()
        if md_adapter._parser is None:
            pytest.skip("tree-sitter-markdown not available")
        # 0xFF 0xFE cannot be decoded as UTF-8; parsing must not raise.
        payload = b"\xff\xfe# Title\n"
        symbols = md_adapter.parse_symbols(payload, "weird.md")
        assert isinstance(symbols, dict)

    def test_very_long_heading_truncated_in_name(self) -> None:
        """A 250-character heading yields one symbol with a capped ``name``."""
        long_heading = "Word " * 50  # 250 chars
        symbols = _parse(f"# {long_heading}\n\nContent.\n")
        assert len(symbols) == 1
        (record,) = symbols.values()
        # name must be at most 120 chars.
        assert len(record["name"]) <= 120

    def test_file_content_id_changes_on_any_change(self) -> None:
        """Swapping the final newline for a space changes the file id."""
        md_adapter = MarkdownAdapter()
        with_newline = b"# Hello\n\nWorld.\n"
        with_space = b"# Hello\n\nWorld. "  # trailing space
        id_newline = md_adapter.file_content_id(with_newline)
        id_space = md_adapter.file_content_id(with_space)
        assert id_newline != id_space

    def test_file_content_id_is_hex_sha256(self) -> None:
        """The file id is 64 lowercase hexadecimal characters."""
        digest = MarkdownAdapter().file_content_id(b"# Hello\n")
        assert len(digest) == 64
        hex_digits = "0123456789abcdef"
        assert all(ch in hex_digits for ch in digest)

    def test_headings_only_no_body(self) -> None:
        """Headings without any body text are still emitted."""
        symbols = _parse("# Title\n## Subtitle\n")
        assert any("Title" in addr for addr in symbols)

    def test_code_block_at_root_level(self) -> None:
        """A code block not inside any section gets a root-level address."""
        symbols = _parse("```python\nprint('hi')\n```\n")
        # No parent heading exists, yet the block must still be reported.
        assert any("code[python]" in addr for addr in symbols)

    def test_table_at_root_level(self) -> None:
        """A table with no parent heading is still reported."""
        symbols = _parse("| A | B |\n| - | - |\n| 1 | 2 |\n")
        assert any("table@L" in addr for addr in symbols)
617
618
619 # ---------------------------------------------------------------------------
620 # Real-world README shape
621 # ---------------------------------------------------------------------------
622
class TestRealWorldShape:
    """A README-shaped document that contains sections, code blocks and a table."""

    # Kept at column 0 on purpose: indenting Markdown by four spaces would
    # turn the whole document into an indented code block.
    _README = """\
# Muse

A domain-agnostic version control system.

## Installation

```bash
pip install muse-vcs
```

## Usage

Run `muse init` to initialise a repository.

### Commands

| Command | Description |
| -------------- | ------------------------- |
| `muse init` | Initialise a repository |
| `muse commit` | Record a new snapshot |
| `muse log` | Show commit history |

## API Reference

### `muse.core.snapshot`

Snapshot hashing and workdir diffing.

```python
from muse.core import snapshot
snap = snapshot.build(root)
```

## Contributing

See CONTRIBUTING.md for guidelines.
"""

    def test_top_level_sections_extracted(self) -> None:
        """The H1 section is present under the exact name ``Muse``."""
        syms = _parse(self._README)
        top = [k for k in syms if "::" in k]
        names = [k.split("::")[-1] for k in top]
        # A substring match would also accept nested names such as
        # "Muse.Installation" and pass even if the H1 itself were missing,
        # so require the exact top-level name.
        assert "Muse" in names

    def test_installation_section_extracted(self) -> None:
        """The ``Installation`` section is present."""
        syms = _parse(self._README)
        assert any("Installation" in k for k in syms)

    def test_usage_commands_table_extracted(self) -> None:
        """The commands table is reported as a ``table@L`` symbol."""
        syms = _parse(self._README)
        assert any("table@L" in k for k in syms)

    def test_bash_code_block_extracted(self) -> None:
        """The ``bash`` fence is reported."""
        syms = _parse(self._README)
        assert any("code[bash]" in k for k in syms)

    def test_python_code_block_extracted(self) -> None:
        """The ``python`` fence is reported."""
        syms = _parse(self._README)
        assert any("code[python]" in k for k in syms)

    def test_api_reference_subsection_extracted(self) -> None:
        """The ``API Reference`` section is present."""
        syms = _parse(self._README)
        assert any("API Reference" in k for k in syms)

    def test_all_symbol_records_have_required_keys(self) -> None:
        """Every record exposes the full set of expected keys."""
        syms = _parse(self._README)
        required = {
            "kind", "name", "qualified_name", "content_id", "body_hash",
            "signature_id", "metadata_id", "canonical_key", "lineno", "end_lineno",
        }
        for addr, rec in syms.items():
            missing = required - set(rec.keys())
            assert not missing, f"{addr!r} missing keys: {missing}"

    def test_no_symbol_has_empty_content_id(self) -> None:
        """No record has a falsy ``content_id``."""
        syms = _parse(self._README)
        for addr, rec in syms.items():
            assert rec["content_id"], f"{addr!r} has empty content_id"

    def test_all_linenos_positive(self) -> None:
        """Every record's ``lineno`` is greater than zero."""
        syms = _parse(self._README)
        for addr, rec in syms.items():
            assert rec["lineno"] > 0, f"{addr!r} lineno={rec['lineno']}"

    def test_all_end_linenos_gte_lineno(self) -> None:
        """Every record ends on or after the line it starts on."""
        syms = _parse(self._README)
        for addr, rec in syms.items():
            assert rec["end_lineno"] >= rec["lineno"], (
                f"{addr!r} end_lineno={rec['end_lineno']} < lineno={rec['lineno']}"
            )

    def test_contributing_section_extracted(self) -> None:
        """The ``Contributing`` section is present."""
        syms = _parse(self._README)
        assert any("Contributing" in k for k in syms)

    def test_commands_subsection_qualified_under_usage(self) -> None:
        """``Commands`` is addressed beneath ``Usage``."""
        syms = _parse(self._README)
        # "Commands" lives under "Usage", so its qualified name should
        # contain "Usage.Commands".
        assert any("Usage.Commands" in k for k in syms)