gabriel / muse public
test_lineage_algorithm.py python
469 lines 19.2 KB
7855ccd0 feat: harden, test, and document all quality-dial changes Gabriel Cardona <gabriel@tellurstori.com> 2d ago
"""Comprehensive unit tests for build_lineage — the symbol provenance engine.

Coverage matrix
---------------
Created     — InsertOp with no prior live symbol sharing the content_id
Copied      — InsertOp whose content_id matches a currently-live symbol
Renamed     — InsertOp + DeleteOp in same commit with matching content_id, same file
Moved       — InsertOp + DeleteOp in same commit with matching content_id, different file
Modified    — ReplaceOp classified as impl_only / signature_change / full_rewrite
Deleted     — DeleteOp at the target address
Multi-event — symbol created, modified, deleted, re-created in the same history
Registry    — incremental content_id registry enables accurate copy detection
              across many commits without re-parsing blobs
No events   — address absent from all commits → empty list
Empty repo  — no commits at all → empty list
"""
17
18 from __future__ import annotations
19
20 import datetime
21 import json
22 import pathlib
23
24 import pytest
25
26 from muse.cli.commands.lineage import build_lineage
27 from muse.core.store import CommitRecord, write_commit
28
29
30 # ---------------------------------------------------------------------------
31 # Fixture helpers
32 # ---------------------------------------------------------------------------
33
34
35 def _make_repo(path: pathlib.Path) -> pathlib.Path:
36 muse = path / ".muse"
37 (muse / "refs" / "heads").mkdir(parents=True)
38 (muse / "objects").mkdir()
39 (muse / "commits").mkdir()
40 (muse / "snapshots").mkdir()
41 (muse / "repo.json").write_text(json.dumps({"repo_id": "test", "domain": "midi"}))
42 (muse / "HEAD").write_text("ref: refs/heads/main\n")
43 return path
44
45
46 _T0 = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)
47
48
def _commit(
    root: pathlib.Path,
    commit_id: str,
    ops: list[dict[str, str | list[dict[str, str]]]],
    offset_days: int = 0,
    parent: str | None = None,
) -> CommitRecord:
    """Persist a synthetic commit whose structured delta carries *ops*.

    The record is stamped ``offset_days`` days after ``_T0``, lives on branch
    ``main`` of repo ``test``, and is written via ``write_commit``.  *parent*
    is forwarded as ``parent_commit_id``.  The written record is returned.
    """
    stamped_at = _T0 + datetime.timedelta(days=offset_days)
    delta = {"ops": ops}
    short_id = commit_id[:8]

    record = CommitRecord(
        commit_id=commit_id,
        repo_id="test",
        branch="main",
        snapshot_id=f"snap-{commit_id}",
        message="commit " + short_id,
        committed_at=stamped_at,
        parent_commit_id=parent,
        structured_delta=delta,
    )
    write_commit(root, record)
    return record
70
71
72 def _insert_op(address: str, content_id: str) -> dict[str, str]:
73 return {"op": "insert", "address": address, "content_id": content_id}
74
75
76 def _delete_op(address: str, content_id: str) -> dict[str, str]:
77 return {"op": "delete", "address": address, "content_id": content_id}
78
79
80 def _replace_op(
81 address: str,
82 old_content_id: str,
83 new_content_id: str,
84 old_summary: str = "impl changed",
85 new_summary: str = "impl changed",
86 ) -> dict[str, str]:
87 return {
88 "op": "replace",
89 "address": address,
90 "old_content_id": old_content_id,
91 "new_content_id": new_content_id,
92 "old_summary": old_summary,
93 "new_summary": new_summary,
94 }
95
96
97 # ---------------------------------------------------------------------------
98 # Empty / no-op cases
99 # ---------------------------------------------------------------------------
100
101
class TestEmptyCases:
    """Histories that hold nothing about the queried address yield ``[]``."""

    def test_no_commits_returns_empty(self, tmp_path: pathlib.Path) -> None:
        """A repo skeleton with zero commits produces no events."""
        repo = _make_repo(tmp_path)
        assert build_lineage(repo, "src/billing.py::compute_total") == []

    def test_address_not_in_any_commit(self, tmp_path: pathlib.Path) -> None:
        """A commit touching only another address produces no events."""
        repo = _make_repo(tmp_path)
        unrelated = _insert_op("src/auth.py::login", "cid-login")
        _commit(repo, "a" * 64, [unrelated])
        assert build_lineage(repo, "src/billing.py::compute_total") == []

    def test_commit_with_no_structured_delta(self, tmp_path: pathlib.Path) -> None:
        """A commit whose ``structured_delta`` is ``None`` produces no events."""
        repo = _make_repo(tmp_path)
        # Built by hand because _commit() always attaches a structured delta.
        write_commit(
            repo,
            CommitRecord(
                commit_id="b" * 64,
                repo_id="test",
                branch="main",
                snapshot_id="snap",
                message="no delta",
                committed_at=_T0,
                structured_delta=None,
            ),
        )
        assert build_lineage(repo, "src/main.py::main") == []
128
129
130 # ---------------------------------------------------------------------------
131 # Created
132 # ---------------------------------------------------------------------------
133
134
class TestCreated:
    """A first insert at an address is reported as a ``created`` event."""

    def test_first_insert_is_created(self, tmp_path: pathlib.Path) -> None:
        """One insert commit yields exactly one event, of kind ``created``."""
        repo = _make_repo(tmp_path)
        symbol = "src/billing.py::compute_total"
        _commit(repo, "c" * 64, [_insert_op(symbol, "cid-v1")])
        history = build_lineage(repo, symbol)
        assert len(history) == 1
        assert history[0].kind == "created"

    def test_created_event_has_correct_commit_id(self, tmp_path: pathlib.Path) -> None:
        """The event carries the id of the commit that performed the insert."""
        repo = _make_repo(tmp_path)
        symbol = "src/main.py::main"
        inserting_commit = "d" * 64
        _commit(repo, inserting_commit, [_insert_op(symbol, "cid-v1")])
        first = build_lineage(repo, symbol)[0]
        assert first.commit_id == inserting_commit

    def test_created_event_records_content_id(self, tmp_path: pathlib.Path) -> None:
        """The inserted content_id is exposed as ``new_content_id``."""
        repo = _make_repo(tmp_path)
        symbol = "src/main.py::main"
        _commit(repo, "e" * 64, [_insert_op(symbol, "cid-abc")])
        first = build_lineage(repo, symbol)[0]
        assert first.new_content_id == "cid-abc"
158
159
160 # ---------------------------------------------------------------------------
161 # Deleted
162 # ---------------------------------------------------------------------------
163
164
class TestDeleted:
    """A delete op at the target address is reported as ``deleted``."""

    def test_delete_after_insert_is_deleted(self, tmp_path: pathlib.Path) -> None:
        """Insert on day 0, delete on day 1: the final event is ``deleted``."""
        repo = _make_repo(tmp_path)
        symbol = "src/api.py::get_user"
        _commit(repo, "f" * 64, [_insert_op(symbol, "cid-v1")], offset_days=0)
        _commit(repo, "a1" * 32, [_delete_op(symbol, "cid-v1")], offset_days=1)
        last = build_lineage(repo, symbol)[-1]
        assert last.kind == "deleted"

    def test_deleted_event_records_content_id(self, tmp_path: pathlib.Path) -> None:
        """The removed content_id is exposed as ``old_content_id``."""
        repo = _make_repo(tmp_path)
        symbol = "src/api.py::delete_user"
        _commit(repo, "f1" * 32, [_insert_op(symbol, "cid-v1")], offset_days=0)
        _commit(repo, "f2" * 32, [_delete_op(symbol, "cid-v1")], offset_days=1)
        last = build_lineage(repo, symbol)[-1]
        assert last.kind == "deleted"
        assert last.old_content_id == "cid-v1"
183
184
185 # ---------------------------------------------------------------------------
186 # Modified
187 # ---------------------------------------------------------------------------
188
189
class TestModified:
    """Replace ops at the target address are reported as ``modified``."""

    def test_replace_op_is_modified(self, tmp_path: pathlib.Path) -> None:
        """A single replace after the insert yields exactly one ``modified``."""
        repo = _make_repo(tmp_path)
        symbol = "src/core.py::hash_content"
        _commit(repo, "g1" * 32, [_insert_op(symbol, "cid-v1")], offset_days=0)
        change = _replace_op(symbol, "cid-v1", "cid-v2")
        _commit(repo, "g2" * 32, [change], offset_days=1)
        history = build_lineage(repo, symbol)
        modifications = [ev for ev in history if ev.kind == "modified"]
        assert len(modifications) == 1

    def test_modified_signature_only(self, tmp_path: pathlib.Path) -> None:
        """``"signature changed"`` summaries map to detail ``signature_change``."""
        repo = _make_repo(tmp_path)
        symbol = "src/core.py::process"
        _commit(repo, "h1" * 32, [_insert_op(symbol, "cid-v1")], offset_days=0)
        change = _replace_op(
            symbol, "cid-v1", "cid-v2", "signature changed", "signature changed"
        )
        _commit(repo, "h2" * 32, [change], offset_days=1)
        history = build_lineage(repo, symbol)
        modifications = [ev for ev in history if ev.kind == "modified"]
        assert modifications[0].detail == "signature_change"

    def test_modified_full_rewrite(self, tmp_path: pathlib.Path) -> None:
        """An ``"impl changed"`` replace is expected to report ``full_rewrite``."""
        repo = _make_repo(tmp_path)
        symbol = "src/core.py::transform"
        _commit(repo, "i1" * 32, [_insert_op(symbol, "cid-aaaa")], offset_days=0)
        # NOTE(review): these summaries equal _replace_op's defaults, so the
        # full_rewrite classification presumably comes from something other
        # than the summary text — confirm against build_lineage.
        change = _replace_op(
            symbol, "cid-aaaa", "cid-bbbb", "impl changed", "impl changed"
        )
        _commit(repo, "i2" * 32, [change], offset_days=1)
        history = build_lineage(repo, symbol)
        modifications = [ev for ev in history if ev.kind == "modified"]
        assert modifications[0].detail == "full_rewrite"

    def test_multiple_modifications(self, tmp_path: pathlib.Path) -> None:
        """Two successive replaces yield two ``modified`` events."""
        repo = _make_repo(tmp_path)
        symbol = "src/worker.py::run"
        _commit(repo, "j1" * 32, [_insert_op(symbol, "cid-v1")], offset_days=0)
        _commit(repo, "j2" * 32, [_replace_op(symbol, "cid-v1", "cid-v2")], offset_days=1)
        _commit(repo, "j3" * 32, [_replace_op(symbol, "cid-v2", "cid-v3")], offset_days=2)
        history = build_lineage(repo, symbol)
        modifications = [ev for ev in history if ev.kind == "modified"]
        assert len(modifications) == 2
239
240
241 # ---------------------------------------------------------------------------
242 # Renamed
243 # ---------------------------------------------------------------------------
244
245
class TestRenamed:
    """Insert + delete of the same content in one file reads as a rename."""

    def test_insert_delete_same_file_is_renamed(self, tmp_path: pathlib.Path) -> None:
        """The new address's lineage contains a ``renamed_from`` event."""
        repo = _make_repo(tmp_path)
        before = "src/billing.py::_compute_total"
        after = "src/billing.py::compute_total"
        _commit(repo, "k1" * 32, [_insert_op(before, "cid-body")], offset_days=0)
        # Same commit, same content_id, same file: insert at the new name and
        # delete at the old one.
        rename_ops = [
            _insert_op(after, "cid-body"),
            _delete_op(before, "cid-body"),
        ]
        _commit(repo, "k2" * 32, rename_ops, offset_days=1)
        history = build_lineage(repo, after)
        assert any(ev.kind == "renamed_from" for ev in history)

    def test_renamed_from_detail_is_source_address(self, tmp_path: pathlib.Path) -> None:
        """``detail`` on the rename event names the address it came from."""
        repo = _make_repo(tmp_path)
        before = "src/billing.py::_inner"
        after = "src/billing.py::public_api"
        _commit(repo, "l1" * 32, [_insert_op(before, "cid-body")], offset_days=0)
        rename_ops = [
            _insert_op(after, "cid-body"),
            _delete_op(before, "cid-body"),
        ]
        _commit(repo, "l2" * 32, rename_ops, offset_days=1)
        history = build_lineage(repo, after)
        renames = [ev for ev in history if ev.kind == "renamed_from"]
        assert renames[0].detail == before
276
277
278 # ---------------------------------------------------------------------------
279 # Moved
280 # ---------------------------------------------------------------------------
281
282
class TestMoved:
    """Insert + delete of the same content across files reads as a move."""

    def test_insert_delete_different_file_is_moved(self, tmp_path: pathlib.Path) -> None:
        """The destination's lineage contains a ``moved_from`` event."""
        repo = _make_repo(tmp_path)
        source = "old/billing.py::compute_invoice_total"
        dest = "src/billing.py::compute_invoice_total"
        _commit(repo, "m1" * 32, [_insert_op(source, "cid-body")], offset_days=0)
        # Same commit, same content_id, different file.
        move_ops = [
            _insert_op(dest, "cid-body"),
            _delete_op(source, "cid-body"),
        ]
        _commit(repo, "m2" * 32, move_ops, offset_days=1)
        history = build_lineage(repo, dest)
        assert any(ev.kind == "moved_from" for ev in history)

    def test_moved_from_detail_is_original_address(self, tmp_path: pathlib.Path) -> None:
        """``detail`` on the move event names the address it came from."""
        repo = _make_repo(tmp_path)
        source = "legacy/module.py::process"
        dest = "src/processing.py::process"
        _commit(repo, "n1" * 32, [_insert_op(source, "cid-body")], offset_days=0)
        move_ops = [
            _insert_op(dest, "cid-body"),
            _delete_op(source, "cid-body"),
        ]
        _commit(repo, "n2" * 32, move_ops, offset_days=1)
        history = build_lineage(repo, dest)
        moves = [ev for ev in history if ev.kind == "moved_from"]
        assert moves[0].detail == source
310
311
312 # ---------------------------------------------------------------------------
313 # Copied
314 # ---------------------------------------------------------------------------
315
316
class TestCopied:
    """An insert that reuses a live symbol's content_id reads as a copy."""

    def test_insert_matching_live_symbol_is_copied(self, tmp_path: pathlib.Path) -> None:
        """Inserting an existing content_id elsewhere yields ``copied_from``."""
        repo = _make_repo(tmp_path)
        source = "src/utils.py::helper"
        duplicate = "src/other.py::helper"
        body_cid = "cid-shared-body"
        # Day 0: the original appears.
        _commit(repo, "o1" * 32, [_insert_op(source, body_cid)], offset_days=0)
        # Day 1: same content_id at a second address, with no accompanying delete.
        _commit(repo, "o2" * 32, [_insert_op(duplicate, body_cid)], offset_days=1)
        history = build_lineage(repo, duplicate)
        assert any(ev.kind == "copied_from" for ev in history)

    def test_copied_from_detail_is_source_address(self, tmp_path: pathlib.Path) -> None:
        """``detail`` on the copy event names the address that was copied."""
        repo = _make_repo(tmp_path)
        source = "src/utils.py::helper"
        duplicate = "src/other.py::helper"
        body_cid = "cid-shared"
        _commit(repo, "p1" * 32, [_insert_op(source, body_cid)], offset_days=0)
        _commit(repo, "p2" * 32, [_insert_op(duplicate, body_cid)], offset_days=1)
        history = build_lineage(repo, duplicate)
        copies = [ev for ev in history if ev.kind == "copied_from"]
        assert copies[0].detail == source

    def test_no_copy_if_source_is_dead(self, tmp_path: pathlib.Path) -> None:
        """If the source was deleted before the copy, it should be 'created' not 'copied'."""
        repo = _make_repo(tmp_path)
        source = "src/utils.py::helper"
        duplicate = "src/other.py::helper"
        body_cid = "cid-shared"
        _commit(repo, "q1" * 32, [_insert_op(source, body_cid)], offset_days=0)
        _commit(repo, "q2" * 32, [_delete_op(source, body_cid)], offset_days=1)
        _commit(repo, "q3" * 32, [_insert_op(duplicate, body_cid)], offset_days=2)
        history = build_lineage(repo, duplicate)
        # The source was deleted on day 1, so it is no longer live when the
        # same content_id is inserted on day 2; that insert is expected to
        # count as 'created' rather than 'copied_from'.
        arrivals = [ev for ev in history if ev.kind in ("created", "copied_from")]
        assert arrivals[0].kind == "created"
355
356
357 # ---------------------------------------------------------------------------
358 # Complex multi-event sequences
359 # ---------------------------------------------------------------------------
360
361
class TestMultiEvent:
    """Lineages made of several events, checked for content and ordering."""

    def test_create_modify_delete_sequence(self, tmp_path: pathlib.Path) -> None:
        """Insert, replace, delete on consecutive days appear in that order."""
        root = _make_repo(tmp_path)
        addr = "src/core.py::process"
        _commit(root, "r1" * 32, [_insert_op(addr, "cid-v1")], offset_days=0)
        _commit(root, "r2" * 32, [_replace_op(addr, "cid-v1", "cid-v2")], offset_days=1)
        _commit(root, "r3" * 32, [_delete_op(addr, "cid-v2")], offset_days=2)
        events = build_lineage(root, addr)
        kinds = [e.kind for e in events]
        assert kinds == ["created", "modified", "deleted"]

    def test_delete_then_recreate(self, tmp_path: pathlib.Path) -> None:
        """Re-inserting fresh content after a delete is a second ``created``."""
        root = _make_repo(tmp_path)
        addr = "src/api.py::endpoint"
        _commit(root, "s1" * 32, [_insert_op(addr, "cid-v1")], offset_days=0)
        _commit(root, "s2" * 32, [_delete_op(addr, "cid-v1")], offset_days=1)
        _commit(root, "s3" * 32, [_insert_op(addr, "cid-v2")], offset_days=2)
        events = build_lineage(root, addr)
        kinds = [e.kind for e in events]
        # Pin the exact sequence: the earlier disjunction
        # (count("created") == 2 or kinds[-1] == "created") also accepted
        # mis-ordered histories such as ["deleted", "created", "created"].
        # "cid-v2" is not carried by any live symbol, so the re-insert is a
        # plain creation rather than a copy.
        assert kinds == ["created", "deleted", "created"]

    def test_ordered_by_commit_timestamp(self, tmp_path: pathlib.Path) -> None:
        """Events follow commit timestamps, not the order commits were written."""
        root = _make_repo(tmp_path)
        addr = "src/main.py::main"
        # Write commits out of temporal order — lineage must sort them.
        _commit(root, "t2" * 32, [_replace_op(addr, "cid-v1", "cid-v2")], offset_days=2)
        _commit(root, "t1" * 32, [_insert_op(addr, "cid-v1")], offset_days=0)
        events = build_lineage(root, addr)
        assert events[0].kind == "created"
        assert events[1].kind == "modified"

    def test_many_commits_accumulate_all_events(self, tmp_path: pathlib.Path) -> None:
        """One insert followed by nine replaces yields ten events in total."""
        root = _make_repo(tmp_path)
        addr = "src/worker.py::run"
        _commit(root, "u0" * 32, [_insert_op(addr, "cid-0")], offset_days=0)
        prev = "cid-0"
        for i in range(1, 10):
            nxt = f"cid-{i}"
            # i is a single digit, so the two-character tag repeated 32 times
            # gives a 64-character commit id like every other test here.
            _commit(root, f"u{i}" * 32, [_replace_op(addr, prev, nxt)], offset_days=i)
            prev = nxt
        events = build_lineage(root, addr)
        assert len(events) == 10  # 1 created + 9 modified
406
407
408 # ---------------------------------------------------------------------------
409 # Incremental registry — copy detection across many commits
410 # ---------------------------------------------------------------------------
411
412
class TestIncrementalRegistry:
    """Copy detection relies on a registry of live content_ids kept across commits."""

    def test_registry_tracks_all_live_symbols(self, tmp_path: pathlib.Path) -> None:
        """The incremental registry must track symbols in commits that don't touch the target."""
        root = _make_repo(tmp_path)
        shared_cid = "cid-shared"
        # Commit 1: insert original at a *different* address (not the target).
        _commit(root, "v1" * 32, [_insert_op("src/a.py::foo", shared_cid)], offset_days=0)
        # Commit 2: insert the target using the same content_id → should be copied_from.
        _commit(root, "v2" * 32, [_insert_op("src/b.py::foo", shared_cid)], offset_days=1)
        events = build_lineage(root, "src/b.py::foo")
        assert events[0].kind == "copied_from"
        assert events[0].detail == "src/a.py::foo"

    def test_registry_prunes_deleted_symbols(self, tmp_path: pathlib.Path) -> None:
        """After deleting the original, its content_id must leave the live registry."""
        root = _make_repo(tmp_path)
        shared = "cid-shared"
        original = "src/a.py::foo"
        target = "src/b.py::foo"
        _commit(root, "w1" * 32, [_insert_op(original, shared)], offset_days=0)
        _commit(root, "w2" * 32, [_delete_op(original, shared)], offset_days=1)
        _commit(root, "w3" * 32, [_insert_op(target, shared)], offset_days=2)
        events = build_lineage(root, target)
        assert events[0].kind == "created"  # not copied — source is dead

    def test_registry_across_ten_intermediate_commits(self, tmp_path: pathlib.Path) -> None:
        """Original inserted in commit 1; target copied in commit 12 — registry must survive."""
        root = _make_repo(tmp_path)
        shared = "cid-shared"
        original = "src/lib.py::util"
        target = "src/app.py::util"

        _commit(root, "x0" * 32, [_insert_op(original, shared)], offset_days=0)
        # 10 unrelated commits that don't touch original or target.
        for i in range(1, 11):
            # Tag with a single hex digit ("x1".."x9", "xa") so every id is
            # exactly 64 characters; a decimal "x10" tag repeated 64 // 3
            # times produced a 63-character id.
            _commit(
                root, f"x{i:x}" * 32,
                [_insert_op(f"src/other_{i}.py::fn", f"cid-other-{i}")],
                offset_days=i,
            )
        # Target is inserted 11 days later — registry must still know original is live.
        # "xb" (hex 11) keeps the id at 64 characters and distinct from the loop's ids.
        _commit(root, "xb" * 32, [_insert_op(target, shared)], offset_days=11)
        events = build_lineage(root, target)
        assert events[0].kind == "copied_from"

    def test_json_output_shape(self, tmp_path: pathlib.Path) -> None:
        """to_dict() must expose commit_id, committed_at and event, with a short commit_id."""
        root = _make_repo(tmp_path)
        addr = "src/main.py::main"
        _commit(root, "y1" * 32, [_insert_op(addr, "cid-abc123456789")], offset_days=0)
        events = build_lineage(root, addr)
        d = events[0].to_dict()
        assert "commit_id" in d
        assert "committed_at" in d
        assert "event" in d
        assert d["event"] == "created"
        assert len(d["commit_id"]) == 8  # truncated to 8 chars