test_markdown_adapter.py
python
| 1 | """Tests for the rewritten MarkdownAdapter. |
| 2 | |
| 3 | Coverage: |
| 4 | - Extension routing: only .md / .rst / .txt are accepted. |
| 5 | - Section symbols: flat headings, hierarchical qualified names, level encoding. |
| 6 | - Content-ID correctness: full section bytes hashed, not just heading text. |
| 7 | - Body-hash / signature split: retitle detection, level-change detection. |
| 8 | - Code block symbols: language tag, no-language fallback, content hash. |
| 9 | - GFM table symbols: header signature, data-row body_hash, schema changes. |
| 10 | - Inline markup stripping: bold, italic, inline-code, links in headings. |
| 11 | - Deduplication: identical sibling headings get @L{lineno} suffix. |
| 12 | - Depth limit: sections beyond _MAX_DEPTH are silently dropped. |
| 13 | - Edge cases: empty file, no headings, setext headings (unsupported → skip). |
| 14 | - Real-world shape: README-shaped document exercises all three emitters. |
| 15 | - _plain_heading unit tests: images dropped, markup stripped, truncation. |
| 16 | """ |
| 17 | |
| 18 | from __future__ import annotations |
| 19 | |
| 20 | import pytest |
| 21 | from muse.plugins.code.ast_parser import ( |
| 22 | MarkdownAdapter, |
| 23 | SymbolRecord, |
| 24 | SymbolTree, |
| 25 | _plain_heading, |
| 26 | ) |
| 27 | |
| 28 | |
| 29 | # --------------------------------------------------------------------------- |
| 30 | # Helpers |
| 31 | # --------------------------------------------------------------------------- |
| 32 | |
def _parse(source: str, path: str = "README.md") -> SymbolTree:
    """Parse *source* with a fresh ``MarkdownAdapter`` and return its symbols.

    The text is encoded with ``str.encode()`` and passed, together with
    *path*, to ``MarkdownAdapter.parse_symbols``.  When the adapter's
    ``_parser`` attribute is ``None`` the calling test is skipped through
    ``pytest.skip`` rather than failing.
    """
    md_adapter = MarkdownAdapter()
    parser_missing = md_adapter._parser is None
    if parser_missing:
        pytest.skip("tree-sitter-markdown not available")
    raw_bytes = source.encode()
    return md_adapter.parse_symbols(raw_bytes, path)
| 38 | |
| 39 | |
| 40 | # --------------------------------------------------------------------------- |
| 41 | # _plain_heading unit tests |
| 42 | # --------------------------------------------------------------------------- |
| 43 | |
class TestPlainHeading:
    """Unit tests for ``_plain_heading``.

    Each test feeds one raw heading string to ``_plain_heading`` and checks
    the plain-text result.  Inputs deliberately contain the *raw* markup
    (emphasis markers, links, images, HTML entities, repeated whitespace) so
    that every assertion fails if the corresponding clean-up step is missing.
    """

    def test_plain_text_unchanged(self) -> None:
        assert _plain_heading("Hello World") == "Hello World"

    def test_bold_stripped(self) -> None:
        assert _plain_heading("**Bold** heading") == "Bold heading"

    def test_italic_star_stripped(self) -> None:
        assert _plain_heading("*italic* text") == "italic text"

    def test_bold_italic_combined(self) -> None:
        assert _plain_heading("***bold italic***") == "bold italic"

    def test_italic_underscore_stripped(self) -> None:
        assert _plain_heading("_italic_") == "italic"

    def test_bold_underscore_stripped(self) -> None:
        assert _plain_heading("__bold__") == "bold"

    def test_inline_code_stripped(self) -> None:
        assert _plain_heading("`code` block") == "code block"

    def test_triple_backtick_stripped(self) -> None:
        assert _plain_heading("```code```") == "code"

    def test_link_keeps_text(self) -> None:
        assert _plain_heading("[link text](https://example.com)") == "link text"

    def test_reference_link_keeps_text(self) -> None:
        assert _plain_heading("[link text][ref]") == "link text"

    def test_image_dropped_entirely(self) -> None:
        # The input must carry real inline-image markup; without it the test
        # would only exercise whitespace stripping.
        assert _plain_heading("![alt](image.png) caption") == "caption"

    def test_reference_image_dropped(self) -> None:
        assert _plain_heading("![alt][ref] caption") == "caption"

    def test_html_entity_amp(self) -> None:
        # Raw entity in, decoded character out.
        assert _plain_heading("foo &amp; bar") == "foo & bar"

    def test_html_entity_lt_gt(self) -> None:
        assert _plain_heading("a &lt; b &gt; c") == "a < b > c"

    def test_html_entity_quot(self) -> None:
        assert _plain_heading("say &quot;hi&quot;") == 'say "hi"'

    def test_html_entity_apos(self) -> None:
        # NOTE(review): assumes the named entity ``&apos;`` is the form the
        # implementation decodes (the test name suggests so) -- confirm
        # against ``_plain_heading``; ``&#39;`` is the numeric alternative.
        assert _plain_heading("it&apos;s") == "it's"

    def test_whitespace_collapsed(self) -> None:
        # Leading, trailing and repeated interior spaces must all be present
        # in the input for "collapsed" to be verified, not just "stripped".
        assert _plain_heading("  too   many   spaces  ") == "too many spaces"

    def test_truncation_at_120_chars(self) -> None:
        oversized = "A" * 200
        assert len(_plain_heading(oversized)) == 120

    def test_empty_string(self) -> None:
        assert _plain_heading("") == ""

    def test_mixed_markup(self) -> None:
        # Realistic heading: "**API** `Reference` Guide"
        assert _plain_heading("**API** `Reference` Guide") == "API Reference Guide"
| 108 | |
| 109 | |
| 110 | # --------------------------------------------------------------------------- |
| 111 | # Extension routing |
| 112 | # --------------------------------------------------------------------------- |
| 113 | |
class TestExtensionRouting:
    """Membership checks on ``MarkdownAdapter.supported_extensions()``."""

    def test_md_supported(self) -> None:
        extensions = MarkdownAdapter().supported_extensions()
        assert ".md" in extensions

    def test_rst_supported(self) -> None:
        extensions = MarkdownAdapter().supported_extensions()
        assert ".rst" in extensions

    def test_txt_supported(self) -> None:
        extensions = MarkdownAdapter().supported_extensions()
        assert ".txt" in extensions

    def test_py_not_supported(self) -> None:
        extensions = MarkdownAdapter().supported_extensions()
        assert ".py" not in extensions

    def test_html_not_supported(self) -> None:
        extensions = MarkdownAdapter().supported_extensions()
        assert ".html" not in extensions
| 134 | |
| 135 | |
| 136 | # --------------------------------------------------------------------------- |
| 137 | # Section symbols: flat headings |
| 138 | # --------------------------------------------------------------------------- |
| 139 | |
class TestFlatSections:
    """Section symbols produced from documents with a single ATX heading."""

    @staticmethod
    def _record_matching(symbols: SymbolTree, fragment: str) -> SymbolRecord:
        """Return the first record whose address contains *fragment*."""
        return next(
            record for address, record in symbols.items() if fragment in address
        )

    def test_h1_emitted(self) -> None:
        addresses = list(_parse("# Hello\n\nContent.\n"))
        assert any("Hello" in address for address in addresses)

    def test_h1_kind_is_section(self) -> None:
        symbols = _parse("# Hello\n\nContent.\n")
        hello = self._record_matching(symbols, "Hello")
        assert hello["kind"] == "section"

    def test_h2_emitted(self) -> None:
        addresses = list(_parse("## Setup\n\nDo the thing.\n"))
        assert any("Setup" in address for address in addresses)

    def test_h3_emitted(self) -> None:
        addresses = list(_parse("### Detail\n\nMore detail.\n"))
        assert any("Detail" in address for address in addresses)

    def test_address_contains_file_path(self) -> None:
        symbols = _parse("# Hello\n", "docs/guide.md")
        assert any(address.startswith("docs/guide.md::") for address in symbols)

    def test_lineno_is_one_based(self) -> None:
        symbols = _parse("# Hello\n\nContent.\n")
        hello = self._record_matching(symbols, "Hello")
        assert hello["lineno"] == 1

    def test_end_lineno_greater_than_lineno(self) -> None:
        symbols = _parse("# Hello\n\nSome content.\n")
        hello = self._record_matching(symbols, "Hello")
        # The check is inclusive: end_lineno may equal lineno.
        assert hello["end_lineno"] >= hello["lineno"]

    def test_name_is_plain_text(self) -> None:
        symbols = _parse("# **Bold** Heading\n\nContent.\n")
        heading = self._record_matching(symbols, "Bold Heading")
        assert heading["name"] == "Bold Heading"
| 179 | |
| 180 | |
| 181 | # --------------------------------------------------------------------------- |
| 182 | # Section symbols: hierarchy |
| 183 | # --------------------------------------------------------------------------- |
| 184 | |
class TestSectionHierarchy:
    """Nested headings yield dot-qualified addresses (``Parent.Child``)."""

    def test_h2_under_h1_has_qualified_name(self) -> None:
        addresses = _parse("# Parent\n\n## Child\n\nText.\n")
        assert any("Parent.Child" in address for address in addresses)

    def test_h3_under_h2_under_h1(self) -> None:
        addresses = _parse("# A\n\n## B\n\n### C\n\nText.\n")
        assert any("A.B.C" in address for address in addresses)

    def test_sibling_h2s_are_distinct(self) -> None:
        addresses = _parse("# Root\n\n## Alpha\n\nFoo.\n\n## Beta\n\nBar.\n")
        for sibling in ("Alpha", "Beta"):
            assert any(sibling in address for address in addresses)

    def test_h2_address_does_not_bleed_into_sibling(self) -> None:
        addresses = _parse("# Root\n\n## A\n\nFoo.\n\n## B\n\nBar.\n")
        # B is a sibling of A, not its child, so "A.B" must never show up.
        assert all("A.B" not in address for address in addresses)

    def test_parent_section_includes_child_in_content_id(self) -> None:
        def parent_record(document: str) -> SymbolRecord:
            symbols = _parse(document)
            return next(
                record
                for address, record in symbols.items()
                if address.endswith("::Parent")
            )

        nested = parent_record("# Parent\n\n## Child\n\nText.\n")
        flat = parent_record("# Parent\n\nText.\n")
        # A child section inside the parent alters the parent's content_id.
        assert nested["content_id"] != flat["content_id"]

    def test_parallel_h2s_in_separate_h1_sections_dont_collide(self) -> None:
        symbols = _parse(
            "# Intro\n\n## Overview\n\nX.\n\n# Usage\n\n## Overview\n\nY.\n"
        )
        # Both Overview headings must be present under different addresses.
        overview = [address for address in symbols if "Overview" in address]
        assert len(overview) == 2
        first, second = overview[0], overview[1]
        assert first != second
| 225 | |
| 226 | |
| 227 | # --------------------------------------------------------------------------- |
| 228 | # Content-ID correctness — the core bug fix |
| 229 | # --------------------------------------------------------------------------- |
| 230 | |
class TestContentIDCorrectness:
    """How ``content_id``, ``body_hash``, ``signature_id`` and ``metadata_id``
    react to edits of a section's heading, level and body."""

    @staticmethod
    def _section(document: str, fragment: str) -> SymbolRecord:
        """Parse *document* and return the record of the first address
        containing *fragment*."""
        symbols = _parse(document)
        address = next(a for a in symbols if fragment in a)
        return symbols[address]

    def test_changing_body_changes_content_id(self) -> None:
        before = self._section("# Intro\n\nFirst paragraph.\n", "Intro")
        after = self._section(
            "# Intro\n\nFirst paragraph changed entirely.\n", "Intro"
        )
        assert before["content_id"] != after["content_id"]

    def test_same_content_produces_same_content_id(self) -> None:
        document = "# Hello\n\nSame content.\n"
        first_run = _parse(document)
        second_run = _parse(document)
        # The address found in the first run is reused for the second run.
        address = next(a for a in first_run if "Hello" in a)
        assert first_run[address]["content_id"] == second_run[address]["content_id"]

    def test_adding_paragraph_changes_content_id(self) -> None:
        before = self._section("# Section\n\nParagraph one.\n", "Section")
        after = self._section(
            "# Section\n\nParagraph one.\n\nParagraph two.\n", "Section"
        )
        assert before["content_id"] != after["content_id"]

    def test_heading_retitle_changes_content_id(self) -> None:
        # The two documents produce different addresses (different titles);
        # each record is looked up by its own title.
        old = self._section("# Old Title\n\nSame body.\n", "Old Title")
        new = self._section("# New Title\n\nSame body.\n", "New Title")
        # The heading text differs, so content_id must differ.
        assert old["content_id"] != new["content_id"]

    def test_retitle_with_same_body_has_same_body_hash(self) -> None:
        """Retitle detection: body_hash stable, signature_id changes."""
        old = self._section("# Old Title\n\nIdentical body content.\n", "Old Title")
        new = self._section("# New Title\n\nIdentical body content.\n", "New Title")
        # Identical text under the heading -> identical body_hash.
        assert old["body_hash"] == new["body_hash"]
        # Changed heading text -> changed signature_id.
        assert old["signature_id"] != new["signature_id"]

    def test_level_change_changes_metadata_id(self) -> None:
        """Promoting a heading level is visible in metadata_id, not body_hash."""
        as_h2 = self._section("## Section\n\nBody.\n", "Section")
        as_h1 = self._section("# Section\n\nBody.\n", "Section")
        assert as_h2["metadata_id"] != as_h1["metadata_id"]
        # The body is unchanged, so body_hash must agree.
        assert as_h2["body_hash"] == as_h1["body_hash"]

    def test_level_change_changes_signature_id(self) -> None:
        as_h2 = self._section("## Section\n\nBody.\n", "Section")
        as_h1 = self._section("# Section\n\nBody.\n", "Section")
        assert as_h2["signature_id"] != as_h1["signature_id"]
| 301 | |
| 302 | |
| 303 | # --------------------------------------------------------------------------- |
| 304 | # Fenced code blocks |
| 305 | # --------------------------------------------------------------------------- |
| 306 | |
class TestCodeBlockSymbols:
    """Symbols emitted for fenced code blocks (``code[lang]`` / ``code@L``)."""

    @staticmethod
    def _block(document: str, tag: str) -> SymbolRecord:
        """Parse *document* and return the record of the first address
        containing *tag*."""
        symbols = _parse(document)
        address = next(a for a in symbols if tag in a)
        return symbols[address]

    def test_python_block_emitted(self) -> None:
        addresses = _parse("# Section\n\n```python\nprint('hello')\n```\n")
        assert any("code[python]" in address for address in addresses)

    def test_code_block_kind_is_variable(self) -> None:
        block = self._block(
            "# Section\n\n```python\nprint('hello')\n```\n", "code[python]"
        )
        assert block["kind"] == "variable"

    def test_no_language_block_emitted(self) -> None:
        addresses = _parse("# Section\n\n```\nplain text\n```\n")
        assert any("code@L" in address for address in addresses)

    def test_no_language_not_in_symbol_name(self) -> None:
        addresses = _parse("# Section\n\n```\nplain text\n```\n")
        # Expected form is code@L..., never code[]@L...
        assert all("code[]" not in address for address in addresses)

    def test_code_block_scoped_to_section(self) -> None:
        addresses = _parse("# Intro\n\n```python\nx = 1\n```\n")
        # The block's address must mention the enclosing section's name.
        assert any(
            "Intro" in address and "code[python]" in address
            for address in addresses
        )

    def test_code_content_change_changes_content_id(self) -> None:
        before = self._block("# S\n\n```python\nx = 1\n```\n", "code[python]")
        after = self._block("# S\n\n```python\nx = 2\n```\n", "code[python]")
        assert before["content_id"] != after["content_id"]

    def test_lang_change_changes_signature_id(self) -> None:
        py_block = self._block("# S\n\n```python\nx = 1\n```\n", "code[python]")
        js_block = self._block(
            "# S\n\n```javascript\nx = 1\n```\n", "code[javascript]"
        )
        assert py_block["signature_id"] != js_block["signature_id"]

    def test_lang_tag_is_lowercased(self) -> None:
        addresses = _parse("# S\n\n```Python\npass\n```\n")
        # "Python" in the fence must appear as "python" in the address.
        assert any("code[python]" in address for address in addresses)

    def test_multiple_code_blocks_are_distinct(self) -> None:
        document = "".join(
            [
                "# Section\n\n",
                "```python\nblock_one = 1\n```\n\n",
                "```python\nblock_two = 2\n```\n",
            ]
        )
        python_blocks = [a for a in _parse(document) if "code[python]" in a]
        assert len(python_blocks) == 2
        first, second = python_blocks[0], python_blocks[1]
        assert first != second

    def test_code_block_lineno_populated(self) -> None:
        block = self._block("# Section\n\n```python\npass\n```\n", "code[python]")
        assert block["lineno"] > 0
| 376 | |
| 377 | |
| 378 | # --------------------------------------------------------------------------- |
| 379 | # GFM pipe tables |
| 380 | # --------------------------------------------------------------------------- |
| 381 | |
class TestTableSymbols:
    """Symbols emitted for GFM pipe tables (addresses containing ``table@L``)."""

    _TABLE_SRC = "".join(
        [
            "# Section\n\n",
            "| Name | Value |\n",
            "| ---- | ----- |\n",
            "| foo | 1 |\n",
            "| bar | 2 |\n",
        ]
    )

    @staticmethod
    def _table(document: str) -> SymbolRecord:
        """Parse *document* and return the record of the first address
        containing ``table@L``."""
        symbols = _parse(document)
        address = next(a for a in symbols if "table@L" in a)
        return symbols[address]

    def test_table_emitted(self) -> None:
        addresses = _parse(self._TABLE_SRC)
        assert any("table@L" in address for address in addresses)

    def test_table_kind_is_section(self) -> None:
        symbols = _parse(self._TABLE_SRC)
        table = next(
            record for address, record in symbols.items() if "table@L" in address
        )
        assert table["kind"] == "section"

    def test_table_scoped_to_section(self) -> None:
        addresses = _parse(self._TABLE_SRC)
        assert any(
            "Section" in address and "table@L" in address for address in addresses
        )

    def test_adding_data_row_changes_content_id(self) -> None:
        one_row = self._table("# S\n\n" "| A | B |\n| - | - |\n| 1 | 2 |\n")
        two_rows = self._table(
            "# S\n\n" "| A | B |\n| - | - |\n| 1 | 2 |\n| 3 | 4 |\n"
        )
        assert one_row["content_id"] != two_rows["content_id"]

    def test_adding_data_row_changes_body_hash(self) -> None:
        one_row = self._table("# S\n\n| A | B |\n| - | - |\n| 1 | 2 |\n")
        two_rows = self._table("# S\n\n| A | B |\n| - | - |\n| 1 | 2 |\n| 3 | 4 |\n")
        assert one_row["body_hash"] != two_rows["body_hash"]

    def test_column_rename_changes_signature_id(self) -> None:
        original = self._table(
            "# S\n\n| Name | Value |\n| ---- | ----- |\n| x | 1 |\n"
        )
        renamed = self._table(
            "# S\n\n| Label | Value |\n| ----- | ----- |\n| x | 1 |\n"
        )
        assert original["signature_id"] != renamed["signature_id"]

    def test_column_rename_does_not_change_body_hash(self) -> None:
        """Renaming a column header should change signature_id but not body_hash."""
        original = self._table(
            "# S\n\n| Name | Value |\n| ---- | ----- |\n| x | 1 |\n"
        )
        renamed = self._table(
            "# S\n\n| Label | Value |\n| ------ | ----- |\n| x | 1 |\n"
        )
        # Only the header row differs; the data rows (and body_hash) match.
        assert original["body_hash"] == renamed["body_hash"]

    def test_table_lineno_populated(self) -> None:
        symbols = _parse(self._TABLE_SRC)
        table = next(
            record for address, record in symbols.items() if "table@L" in address
        )
        assert table["lineno"] > 0
| 452 | |
| 453 | |
| 454 | # --------------------------------------------------------------------------- |
| 455 | # Inline markup stripping — address stability |
| 456 | # --------------------------------------------------------------------------- |
| 457 | |
class TestInlineMarkupStripping:
    """Inline markup in headings must not leak into symbol names/addresses.

    Each test parses a one-heading document whose heading carries inline
    markup (bold, inline code, a link, an image) and checks the resulting
    address or ``name`` against the plain-text form.
    """

    def test_bold_heading_address_matches_plain(self) -> None:
        syms_bold = _parse("# **Setup**\n\nContent.\n")
        syms_plain = _parse("# Setup\n\nContent.\n")
        # Both should produce a key containing "Setup" (not **Setup**).
        assert any("Setup" in k for k in syms_bold)
        assert any("Setup" in k for k in syms_plain)
        # The "name" field of both records should be identical.
        name_bold = next(v for k, v in syms_bold.items() if "Setup" in k)["name"]
        name_plain = next(v for k, v in syms_plain.items() if "Setup" in k)["name"]
        assert name_bold == name_plain

    def test_inline_code_heading_stripped(self) -> None:
        syms = _parse("# `muse init` Command\n\nContent.\n")
        assert any("muse init Command" in k for k in syms)

    def test_link_heading_keeps_text(self) -> None:
        syms = _parse("# [API Reference](https://example.com/api)\n\nContent.\n")
        assert any("API Reference" in k for k in syms)

    def test_image_in_heading_dropped(self) -> None:
        # The heading must actually contain an inline image pointing at
        # logo.png; otherwise the "logo.png not in key" check below can
        # never fail.
        syms = _parse("# ![logo](logo.png) Intro\n\nContent.\n")
        # The logo image should be gone; "Intro" should remain.
        assert any("Intro" in k for k in syms)
        assert not any("logo.png" in k for k in syms)
| 488 | |
| 489 | |
| 490 | # --------------------------------------------------------------------------- |
| 491 | # Deduplication |
| 492 | # --------------------------------------------------------------------------- |
| 493 | |
class TestDeduplication:
    """Address uniqueness when the same heading text occurs more than once."""

    # One H1 with two identically titled H2 children.
    _TWIN_H2_SRC = "# Root\n\n" "## Examples\n\nFirst.\n\n" "## Examples\n\nSecond.\n"

    def test_two_identical_h2s_get_unique_addresses(self) -> None:
        twins = [a for a in _parse(self._TWIN_H2_SRC) if "Examples" in a]
        assert len(twins) == 2
        first, second = twins[0], twins[1]
        assert first != second

    def test_deduplicated_key_contains_lineno(self) -> None:
        twins = [a for a in _parse(self._TWIN_H2_SRC) if "Examples" in a]
        # At least one of the colliding addresses carries an "@L" marker.
        assert any("@L" in address for address in twins)

    def test_identical_headings_in_different_parents_not_deduped(self) -> None:
        document = "# Alpha\n\n## Notes\n\nFoo.\n\n" "# Beta\n\n## Notes\n\nBar.\n"
        notes = [a for a in _parse(document) if "Notes" in a]
        assert len(notes) == 2
        # Expected addresses: Alpha.Notes and Beta.Notes -- no @L suffix needed.
        for qualified in ("Alpha.Notes", "Beta.Notes"):
            assert any(qualified in address for address in notes)
| 528 | |
| 529 | |
| 530 | # --------------------------------------------------------------------------- |
| 531 | # Depth limit |
| 532 | # --------------------------------------------------------------------------- |
| 533 | |
class TestDepthLimit:
    """Parsing must stay robust for deep heading runs and keep shallow ones."""

    def test_deep_nesting_does_not_crash(self) -> None:
        # Emit lines prefixed with 1..20 '#' characters.  CommonMark only
        # recognises ATX headings of level 1-6, so lines with seven or more
        # '#' are ordinary paragraph text; the point of the test is simply
        # that parsing this input does not raise.
        levels = ["#" * i + f" Level{i}\n\nText.\n\n" for i in range(1, 21)]
        src = "".join(levels)
        # Should not raise; may return fewer symbols than levels.
        syms = _parse(src)
        assert isinstance(syms, dict)

    def test_symbols_within_limit_are_extracted(self) -> None:
        # Only 3 levels -- all should be extracted.
        src = "# A\n\n## A B\n\n### A B C\n\nText.\n"
        syms = _parse(src)
        # A bare `"A" in key` check would always pass because every address
        # starts with the default path "README.md", which itself contains
        # "A".  Match the qualified-name tail of each level instead.
        assert any(k.endswith("::A") for k in syms)
        assert any(k.endswith("A.A B") for k in syms)
        assert any(k.endswith("A.A B.A B C") for k in syms)
| 548 | |
| 549 | |
| 550 | # --------------------------------------------------------------------------- |
| 551 | # Edge cases |
| 552 | # --------------------------------------------------------------------------- |
| 553 | |
class TestEdgeCases:
    """Degenerate inputs: empty or heading-less files, odd bytes, root-level
    blocks, and the adapter's ``file_content_id``."""

    @staticmethod
    def _adapter_or_skip() -> MarkdownAdapter:
        """Return a new adapter, skipping the test if its ``_parser`` is None."""
        candidate = MarkdownAdapter()
        if candidate._parser is None:
            pytest.skip("tree-sitter-markdown not available")
        return candidate

    def test_empty_file_returns_empty(self) -> None:
        symbols = self._adapter_or_skip().parse_symbols(b"", "empty.md")
        assert symbols == {}

    def test_no_headings_returns_empty(self) -> None:
        assert _parse("Just a paragraph with no headings.\n") == {}

    def test_only_horizontal_rule_returns_empty(self) -> None:
        assert _parse("---\n") == {}

    def test_binary_like_content_does_not_crash(self) -> None:
        md_adapter = self._adapter_or_skip()
        # Bytes that are not valid UTF-8 must be tolerated without raising.
        symbols = md_adapter.parse_symbols(b"\xff\xfe# Title\n", "weird.md")
        assert isinstance(symbols, dict)

    def test_very_long_heading_truncated_in_name(self) -> None:
        title = "Word " * 50  # 250 chars
        symbols = _parse(f"# {title}\n\nContent.\n")
        assert len(symbols) == 1
        (only_record,) = list(symbols.values())[:1]
        # The stored name is capped at 120 characters.
        assert len(only_record["name"]) <= 120

    def test_file_content_id_changes_on_any_change(self) -> None:
        md_adapter = MarkdownAdapter()
        with_newline = b"# Hello\n\nWorld.\n"
        with_space = b"# Hello\n\nWorld. "  # trailing space
        id_newline = md_adapter.file_content_id(with_newline)
        id_space = md_adapter.file_content_id(with_space)
        assert id_newline != id_space

    def test_file_content_id_is_hex_sha256(self) -> None:
        digest = MarkdownAdapter().file_content_id(b"# Hello\n")
        assert len(digest) == 64
        hex_digits = "0123456789abcdef"
        assert all(ch in hex_digits for ch in digest)

    def test_headings_only_no_body(self) -> None:
        addresses = _parse("# Title\n## Subtitle\n")
        assert any("Title" in address for address in addresses)

    def test_code_block_at_root_level(self) -> None:
        """A code block not inside any section gets a root-level address."""
        addresses = _parse("```python\nprint('hi')\n```\n")
        # The block is emitted even though no heading precedes it.
        assert any("code[python]" in address for address in addresses)

    def test_table_at_root_level(self) -> None:
        addresses = _parse("| A | B |\n| - | - |\n| 1 | 2 |\n")
        assert any("table@L" in address for address in addresses)
| 617 | |
| 618 | |
| 619 | # --------------------------------------------------------------------------- |
| 620 | # Real-world README shape |
| 621 | # --------------------------------------------------------------------------- |
| 622 | |
class TestRealWorldShape:
    """End-to-end checks against a README-shaped document.

    ``_README`` contains nested headings, two fenced code blocks (bash and
    python) and one pipe table, so a single parse exercises the section,
    code-block and table emitters together.
    """

    _README = """\
# Muse

A domain-agnostic version control system.

## Installation

```bash
pip install muse-vcs
```

## Usage

Run `muse init` to initialise a repository.

### Commands

| Command        | Description               |
| -------------- | ------------------------- |
| `muse init`    | Initialise a repository   |
| `muse commit`  | Record a new snapshot     |
| `muse log`     | Show commit history       |

## API Reference

### `muse.core.snapshot`

Snapshot hashing and workdir diffing.

```python
from muse.core import snapshot
snap = snapshot.build(root)
```

## Contributing

See CONTRIBUTING.md for guidelines.
"""

    def test_top_level_sections_extracted(self) -> None:
        syms = _parse(self._README)
        # The last "::"-separated component of each address is its
        # qualified name.  Require the H1 itself ("Muse"), not merely some
        # descendant such as "Muse.Installation" that contains the word.
        names = [k.split("::")[-1] for k in syms if "::" in k]
        assert "Muse" in names

    def test_installation_section_extracted(self) -> None:
        syms = _parse(self._README)
        assert any("Installation" in k for k in syms)

    def test_usage_commands_table_extracted(self) -> None:
        syms = _parse(self._README)
        assert any("table@L" in k for k in syms)

    def test_bash_code_block_extracted(self) -> None:
        syms = _parse(self._README)
        assert any("code[bash]" in k for k in syms)

    def test_python_code_block_extracted(self) -> None:
        syms = _parse(self._README)
        assert any("code[python]" in k for k in syms)

    def test_api_reference_subsection_extracted(self) -> None:
        syms = _parse(self._README)
        assert any("API Reference" in k for k in syms)

    def test_all_symbol_records_have_required_keys(self) -> None:
        syms = _parse(self._README)
        required = {
            "kind", "name", "qualified_name", "content_id", "body_hash",
            "signature_id", "metadata_id", "canonical_key", "lineno", "end_lineno",
        }
        for addr, rec in syms.items():
            missing = required - set(rec.keys())
            assert not missing, f"{addr!r} missing keys: {missing}"

    def test_no_symbol_has_empty_content_id(self) -> None:
        syms = _parse(self._README)
        for addr, rec in syms.items():
            assert rec["content_id"], f"{addr!r} has empty content_id"

    def test_all_linenos_positive(self) -> None:
        syms = _parse(self._README)
        for addr, rec in syms.items():
            assert rec["lineno"] > 0, f"{addr!r} lineno={rec['lineno']}"

    def test_all_end_linenos_gte_lineno(self) -> None:
        syms = _parse(self._README)
        for addr, rec in syms.items():
            assert rec["end_lineno"] >= rec["lineno"], (
                f"{addr!r} end_lineno={rec['end_lineno']} < lineno={rec['lineno']}"
            )

    def test_contributing_section_extracted(self) -> None:
        syms = _parse(self._README)
        assert any("Contributing" in k for k in syms)

    def test_commands_subsection_qualified_under_usage(self) -> None:
        syms = _parse(self._README)
        # "Commands" lives under "Usage", so its qualified name should
        # contain "Usage.Commands".
        assert any("Usage.Commands" in k for k in syms)