gabriel / muse public
symbol_diff.py python
581 lines 22.5 KB
1afd410c fix: detect file-level move+edit as a single PatchOp Gabriel Cardona <gabriel@tellurstori.com> 3d ago
1 """Symbol-level diff engine for the code domain plugin.
2
3 Produces typed :class:`~muse.domain.DomainOp` entries at symbol granularity
4 rather than file granularity. This allows Muse to report which *functions*
5 were added, removed, renamed, or modified — not just which *files* changed.
6
7 Operation types produced
8 ------------------------
9 ``InsertOp``
10 A symbol was added to the file (new function, class, etc.).
11
12 ``DeleteOp``
13 A symbol was removed from the file.
14
15 ``ReplaceOp``
16 A symbol's content changed. The ``old_summary`` / ``new_summary`` fields
17 describe the nature of the change:
18
19 - ``"renamed to <name>"`` — same body, different name (rename detected
20 via matching ``body_hash``).
21 - ``"signature changed"`` — same body, different signature.
22 - ``"implementation changed"`` — same signature, different body.
23 - ``"modified"`` — both signature and body changed.
24
25 ``MoveOp``
26 Reserved for intra-file positional moves (used when a symbol's address
27 is unchanged but it is detected elsewhere via ``content_id``).
28
29 Cross-file move detection
30 -------------------------
31 When a symbol disappears from one file and appears in another with an
32 identical ``content_id``, the diff engine annotates the ``DeleteOp`` and
33 ``InsertOp`` ``content_summary`` fields to indicate the move direction. No
34 special op type is introduced — the existing :class:`~muse.domain.InsertOp`
35 and :class:`~muse.domain.DeleteOp` suffice because their addresses already
36 encode the file path.
37
38 Algorithm
39 ---------
40 1. Partition symbol addresses into ``added``, ``removed``, and ``common``.
41 2. Build ``body_hash → address`` reverse maps for added and removed sets.
42 3. For each ``removed`` symbol:
43
44 a. If ``content_id`` matches an ``added`` symbol → **exact move/copy**
45 (same name, same body, different file).
46 b. Else if ``body_hash`` matches an ``added`` symbol → **rename**
47 (same body, different name).
48
49 4. Emit ``ReplaceOp`` for renames; pair the cross-file move partners via
50 ``content_summary``.
51 5. Emit ``DeleteOp`` for genuinely removed symbols.
52 6. Emit ``InsertOp`` for genuinely added symbols.
53 7. Emit ``ReplaceOp`` for symbols whose ``content_id`` changed.
54 """
55
56 from __future__ import annotations
57
58 import logging
59
60 from muse.domain import DeleteOp, DomainOp, InsertOp, PatchOp, ReplaceOp
61 from muse.plugins.code.ast_parser import SymbolRecord, SymbolTree
62
63 logger = logging.getLogger(__name__)
64
65 _CHILD_DOMAIN = "code_symbols"
66
67
68 # ---------------------------------------------------------------------------
69 # Symbol-level diff within a single file
70 # ---------------------------------------------------------------------------
71
72
73 def diff_symbol_trees(
74 base: SymbolTree,
75 target: SymbolTree,
76 ) -> list[DomainOp]:
77 """Compute symbol-level ops transforming *base* into *target*.
78
79 Both trees must be scoped to the same file (their addresses share the
80 same ``"<file_path>::"`` prefix).
81
82 Args:
83 base: Symbol tree of the base (older) version of the file.
84 target: Symbol tree of the target (newer) version of the file.
85
86 Returns:
87 Ordered list of :class:`~muse.domain.DomainOp` entries.
88 """
89 base_addrs = set(base)
90 target_addrs = set(target)
91 added: set[str] = target_addrs - base_addrs
92 removed: set[str] = base_addrs - target_addrs
93 common: set[str] = base_addrs & target_addrs
94
95 # Reverse maps for rename / move detection.
96 added_by_content: dict[str, str] = {
97 target[a]["content_id"]: a for a in added
98 }
99 added_by_body: dict[str, str] = {
100 target[a]["body_hash"]: a for a in added
101 }
102
103 ops: list[DomainOp] = []
104 # Addresses claimed by rename / move detection — excluded from plain ops.
105 matched_removed: set[str] = set()
106 matched_added: set[str] = set()
107
108 # ── Pass 1: renames (same body_hash, different name) ──────────────────
109 for rem_addr in sorted(removed):
110 base_rec = base[rem_addr]
111
112 if base_rec["content_id"] in added_by_content:
113 # Exact content match at a different address → same symbol moved
114 # (intra-file positional moves don't produce a different address;
115 # this catches cross-file moves surfaced within a single-file diff
116 # when the caller slices the tree incorrectly — uncommon).
117 tgt_addr = added_by_content[base_rec["content_id"]]
118 tgt_rec = target[tgt_addr]
119 ops.append(ReplaceOp(
120 op="replace",
121 address=rem_addr,
122 position=None,
123 old_content_id=base_rec["content_id"],
124 new_content_id=tgt_rec["content_id"],
125 old_summary=f"{base_rec['kind']} {base_rec['name']}",
126 new_summary=f"moved to {tgt_rec['qualified_name']}",
127 ))
128 matched_removed.add(rem_addr)
129 matched_added.add(tgt_addr)
130
131 elif (
132 base_rec["body_hash"] in added_by_body
133 and added_by_body[base_rec["body_hash"]] not in matched_added
134 ):
135 # Same body, different name → rename.
136 tgt_addr = added_by_body[base_rec["body_hash"]]
137 tgt_rec = target[tgt_addr]
138 ops.append(ReplaceOp(
139 op="replace",
140 address=rem_addr,
141 position=None,
142 old_content_id=base_rec["content_id"],
143 new_content_id=tgt_rec["content_id"],
144 old_summary=f"{base_rec['kind']} {base_rec['name']}",
145 new_summary=f"renamed to {tgt_rec['name']}",
146 ))
147 matched_removed.add(rem_addr)
148 matched_added.add(tgt_addr)
149
150 # ── Pass 2: plain deletions ────────────────────────────────────────────
151 for rem_addr in sorted(removed - matched_removed):
152 rec = base[rem_addr]
153 ops.append(DeleteOp(
154 op="delete",
155 address=rem_addr,
156 position=None,
157 content_id=rec["content_id"],
158 content_summary=f"removed {rec['kind']} {rec['name']}",
159 ))
160
161 # ── Pass 3: plain additions ────────────────────────────────────────────
162 for add_addr in sorted(added - matched_added):
163 rec = target[add_addr]
164 ops.append(InsertOp(
165 op="insert",
166 address=add_addr,
167 position=None,
168 content_id=rec["content_id"],
169 content_summary=f"added {rec['kind']} {rec['name']}",
170 ))
171
172 # ── Pass 4: modifications ──────────────────────────────────────────────
173 for addr in sorted(common):
174 base_rec = base[addr]
175 tgt_rec = target[addr]
176 if base_rec["content_id"] == tgt_rec["content_id"]:
177 continue # unchanged
178
179 if base_rec["body_hash"] == tgt_rec["body_hash"]:
180 # Body unchanged — signature changed (type annotations, defaults…).
181 old_summary = f"{base_rec['kind']} {base_rec['name']} (signature changed)"
182 new_summary = f"{tgt_rec['kind']} {tgt_rec['name']} (signature updated)"
183 elif base_rec["signature_id"] == tgt_rec["signature_id"]:
184 # Signature unchanged — implementation changed.
185 old_summary = f"{base_rec['kind']} {base_rec['name']} (implementation)"
186 new_summary = f"{tgt_rec['kind']} {tgt_rec['name']} (implementation changed)"
187 else:
188 # Both signature and body changed.
189 old_summary = f"{base_rec['kind']} {base_rec['name']}"
190 new_summary = f"{tgt_rec['kind']} {tgt_rec['name']} (modified)"
191
192 ops.append(ReplaceOp(
193 op="replace",
194 address=addr,
195 position=None,
196 old_content_id=base_rec["content_id"],
197 new_content_id=tgt_rec["content_id"],
198 old_summary=old_summary,
199 new_summary=new_summary,
200 ))
201
202 return ops
203
204
205 # ---------------------------------------------------------------------------
206 # Cross-file diff: build the full op list for a snapshot pair
207 # ---------------------------------------------------------------------------
208
209
210 def build_diff_ops(
211 base_files: dict[str, str],
212 target_files: dict[str, str],
213 base_trees: dict[str, SymbolTree],
214 target_trees: dict[str, SymbolTree],
215 ) -> list[DomainOp]:
216 """Build the complete op list transforming *base* snapshot into *target*.
217
218 For each changed file:
219
220 - **No symbol trees available**: coarse ``InsertOp`` / ``DeleteOp`` /
221 ``ReplaceOp`` at file level.
222 - **Symbol trees available for both sides**: ``PatchOp`` with symbol-level
223 ``child_ops``. If all symbols are unchanged (formatting-only change)
224 a ``ReplaceOp`` with ``"reformatted"`` summary is emitted instead.
225 - **Symbol tree available for one side only** (new or deleted file):
226 ``PatchOp`` listing each symbol individually.
227
228 Cross-file move annotation
229 --------------------------
230 After building per-file ops, a second pass checks whether any symbol
231 ``content_id`` appears in both a ``DeleteOp`` child op and an ``InsertOp``
232 child op across *different* files. When found, both ops' ``content_summary``
233 fields are annotated with the move direction.
234
235 Args:
236 base_files: ``{path: raw_bytes_hash}`` from the base snapshot.
237 target_files: ``{path: raw_bytes_hash}`` from the target snapshot.
238 base_trees: Symbol trees for changed base files, keyed by path.
239 target_trees: Symbol trees for changed target files, keyed by path.
240
241 Returns:
242 Ordered list of ``DomainOp`` entries.
243 """
244 base_paths = set(base_files)
245 target_paths = set(target_files)
246 added_paths = sorted(target_paths - base_paths)
247 removed_paths = sorted(base_paths - target_paths)
248 modified_paths = sorted(
249 p for p in base_paths & target_paths
250 if base_files[p] != target_files[p]
251 )
252
253 # Detect file-level move+edits before emitting per-file ops so we can
254 # suppress the plain added/removed ops for those paths.
255 move_map = _detect_file_move_edits(
256 added_paths, removed_paths, base_trees, target_trees
257 )
258 moved_old = set(move_map)
259 moved_new = set(move_map.values())
260
261 ops: list[DomainOp] = []
262
263 # ── Added files (excluding move+edit targets) ──────────────────────────
264 for path in added_paths:
265 if path in moved_new:
266 continue
267 tree = target_trees.get(path, {})
268 if tree:
269 child_ops: list[DomainOp] = [
270 InsertOp(
271 op="insert",
272 address=addr,
273 position=None,
274 content_id=rec["content_id"],
275 content_summary=f"added {rec['kind']} {rec['name']}",
276 )
277 for addr, rec in sorted(tree.items())
278 ]
279 ops.append(_patch(path, child_ops))
280 else:
281 ops.append(InsertOp(
282 op="insert",
283 address=path,
284 position=None,
285 content_id=target_files[path],
286 content_summary=f"added {path}",
287 ))
288
289 # ── Removed files (excluding move+edit sources) ────────────────────────
290 for path in removed_paths:
291 if path in moved_old:
292 continue
293 tree = base_trees.get(path, {})
294 if tree:
295 child_ops = [
296 DeleteOp(
297 op="delete",
298 address=addr,
299 position=None,
300 content_id=rec["content_id"],
301 content_summary=f"removed {rec['kind']} {rec['name']}",
302 )
303 for addr, rec in sorted(tree.items())
304 ]
305 ops.append(_patch(path, child_ops))
306 else:
307 ops.append(DeleteOp(
308 op="delete",
309 address=path,
310 position=None,
311 content_id=base_files[path],
312 content_summary=f"removed {path}",
313 ))
314
315 # ── Modified files ─────────────────────────────────────────────────────
316 for path in modified_paths:
317 base_tree = base_trees.get(path, {})
318 target_tree = target_trees.get(path, {})
319
320 if base_tree or target_tree:
321 child_ops = diff_symbol_trees(base_tree, target_tree)
322 if child_ops:
323 ops.append(_patch(path, child_ops))
324 else:
325 # All symbols have the same content_id — formatting-only change.
326 ops.append(ReplaceOp(
327 op="replace",
328 address=path,
329 position=None,
330 old_content_id=base_files[path],
331 new_content_id=target_files[path],
332 old_summary=f"{path} (before)",
333 new_summary=f"{path} (reformatted — no semantic change)",
334 ))
335 else:
336 ops.append(ReplaceOp(
337 op="replace",
338 address=path,
339 position=None,
340 old_content_id=base_files[path],
341 new_content_id=target_files[path],
342 old_summary=f"{path} (before)",
343 new_summary=f"{path} (after)",
344 ))
345
346 # ── Move+edit files ────────────────────────────────────────────────────
347 for old_path, new_path in sorted(move_map.items()):
348 old_tree = base_trees.get(old_path, {})
349 new_tree = target_trees.get(new_path, {})
350 child_ops = diff_symbol_trees(old_tree, new_tree)
351
352 n_added = sum(1 for o in child_ops if o["op"] == "insert")
353 n_removed = sum(1 for o in child_ops if o["op"] == "delete")
354 n_modified = sum(1 for o in child_ops if o["op"] == "replace")
355 sym_parts: list[str] = []
356 if n_added:
357 sym_parts.append(f"{n_added} added")
358 if n_removed:
359 sym_parts.append(f"{n_removed} removed")
360 if n_modified:
361 sym_parts.append(f"{n_modified} modified")
362 child_summary = f"moved from {old_path}"
363 if sym_parts:
364 child_summary += f"; {', '.join(sym_parts)}"
365
366 ops.append(PatchOp(
367 op="patch",
368 address=new_path,
369 from_address=old_path,
370 child_ops=child_ops,
371 child_domain=_CHILD_DOMAIN,
372 child_summary=child_summary,
373 ))
374
375 _annotate_cross_file_moves(ops)
376 return ops
377
378
379 def _patch(path: str, child_ops: list[DomainOp]) -> PatchOp:
380 """Wrap symbol child_ops in a file-level PatchOp."""
381 n_added = sum(1 for o in child_ops if o["op"] == "insert")
382 n_removed = sum(1 for o in child_ops if o["op"] == "delete")
383 n_modified = sum(1 for o in child_ops if o["op"] == "replace")
384 parts: list[str] = []
385 if n_added:
386 parts.append(f"{n_added} symbol{'s' if n_added > 1 else ''} added")
387 if n_removed:
388 parts.append(f"{n_removed} symbol{'s' if n_removed > 1 else ''} removed")
389 if n_modified:
390 parts.append(f"{n_modified} symbol{'s' if n_modified > 1 else ''} modified")
391 summary = ", ".join(parts) if parts else "no symbol changes"
392 return PatchOp(
393 op="patch",
394 address=path,
395 child_ops=child_ops,
396 child_domain=_CHILD_DOMAIN,
397 child_summary=summary,
398 )
399
400
401 def _detect_file_move_edits(
402 added_paths: list[str],
403 removed_paths: list[str],
404 base_trees: dict[str, SymbolTree],
405 target_trees: dict[str, SymbolTree],
406 min_overlap: float = 0.5,
407 ) -> dict[str, str]:
408 """Return a mapping of old_path → new_path for file-level move+edits.
409
410 A file is considered moved-and-edited when the two symbol trees share at
411 least ``min_overlap`` fraction of symbols by ``body_hash`` (computed
412 against the smaller set). This mirrors the symbol-level rename heuristic
413 applied cross-file.
414
415 Each old_path and new_path is used at most once (greedy, highest-overlap
416 pair wins when multiple candidates exist).
417
418 Args:
419 added_paths: Paths present in target but not in base.
420 removed_paths: Paths present in base but not in target.
421 base_trees: Symbol trees for changed base files.
422 target_trees: Symbol trees for changed target files.
423 min_overlap: Minimum fraction of matching body_hashes required.
424
425 Returns:
426 ``{old_path: new_path}`` for each detected move+edit pair.
427 """
428 base_hashes: dict[str, set[str]] = {
429 p: {rec["body_hash"] for rec in base_trees[p].values()}
430 for p in removed_paths
431 if p in base_trees and base_trees[p]
432 }
433 target_hashes: dict[str, set[str]] = {
434 p: {rec["body_hash"] for rec in target_trees[p].values()}
435 for p in added_paths
436 if p in target_trees and target_trees[p]
437 }
438
439 # Score all candidate pairs, then greedily assign best matches.
440 candidates: list[tuple[float, str, str]] = []
441 for old_path, old_h in base_hashes.items():
442 for new_path, new_h in target_hashes.items():
443 common = old_h & new_h
444 if not common:
445 continue
446 overlap = len(common) / min(len(old_h), len(new_h))
447 if overlap >= min_overlap:
448 candidates.append((overlap, old_path, new_path))
449
450 candidates.sort(key=lambda t: t[0], reverse=True)
451
452 moves: dict[str, str] = {}
453 used_old: set[str] = set()
454 used_new: set[str] = set()
455 for _, old_path, new_path in candidates:
456 if old_path in used_old or new_path in used_new:
457 continue
458 moves[old_path] = new_path
459 used_old.add(old_path)
460 used_new.add(new_path)
461
462 return moves
463
464
465 def _annotate_cross_file_moves(ops: list[DomainOp]) -> None:
466 """Annotate DeleteOp/InsertOp pairs that represent cross-file symbol moves.
467
468 Mutates the ``content_summary`` of matching ops in place. A move is
469 detected when:
470
471 - A ``DeleteOp`` child op (inside a ``PatchOp``) has the same
472 ``content_id`` as an ``InsertOp`` child op in a *different* file's
473 ``PatchOp``.
474
475 This is a best-effort annotation pass — it does not change the semantic
476 meaning of the ops, only their human-readable summaries.
477 """
478 # Collect child op references: content_id → (file_path, op_index_in_patch)
479 # We need mutable access so work with lists rather than immutable tuples.
480 delete_by_content: dict[str, tuple[str, int, list[DomainOp]]] = {}
481 insert_by_content: dict[str, tuple[str, int, list[DomainOp]]] = {}
482
483 for op in ops:
484 if op["op"] != "patch":
485 continue
486 file_path = op["address"]
487 for i, child in enumerate(op["child_ops"]):
488 if child["op"] == "delete":
489 delete_by_content[child["content_id"]] = (file_path, i, op["child_ops"])
490 elif child["op"] == "insert":
491 insert_by_content[child["content_id"]] = (file_path, i, op["child_ops"])
492
493 for content_id, (del_file, del_idx, del_children) in delete_by_content.items():
494 if content_id not in insert_by_content:
495 continue
496 ins_file, ins_idx, ins_children = insert_by_content[content_id]
497 if del_file == ins_file:
498 continue # Same file — not a cross-file move.
499
500 del_op = del_children[del_idx]
501 ins_op = ins_children[ins_idx]
502 # Narrow to the expected op kinds before accessing kind-specific fields.
503 if del_op["op"] != "delete" or ins_op["op"] != "insert":
504 continue
505 # Annotate both sides with move direction.
506 del_children[del_idx] = DeleteOp(
507 op="delete",
508 address=del_op["address"],
509 position=del_op["position"],
510 content_id=del_op["content_id"],
511 content_summary=f"{del_op['content_summary']} → moved to {ins_file}",
512 )
513 ins_children[ins_idx] = InsertOp(
514 op="insert",
515 address=ins_op["address"],
516 position=ins_op["position"],
517 content_id=ins_op["content_id"],
518 content_summary=f"{ins_op['content_summary']} ← moved from {del_file}",
519 )
520
521
522 # ---------------------------------------------------------------------------
523 # Summary helpers
524 # ---------------------------------------------------------------------------
525
526
527 def delta_summary(ops: list[DomainOp]) -> str:
528 """Produce a human-readable one-line summary of a list of ops.
529
530 Counts file-level and symbol-level operations separately.
531
532 Args:
533 ops: Top-level op list from :func:`build_diff_ops`.
534
535 Returns:
536 A concise summary string (e.g. ``"2 files modified (5 symbols)"``)
537 or ``"no changes"`` for an empty list.
538 """
539 files_added = sum(1 for o in ops if o["op"] == "insert" and "::" not in o["address"])
540 files_removed = sum(1 for o in ops if o["op"] == "delete" and "::" not in o["address"])
541 files_modified = sum(1 for o in ops if o["op"] in ("replace", "patch") and "::" not in o["address"])
542
543 # Count child-level symbol ops.
544 symbols_added = 0
545 symbols_removed = 0
546 symbols_modified = 0
547 for op in ops:
548 if op["op"] == "patch":
549 for child in op["child_ops"]:
550 if child["op"] == "insert":
551 symbols_added += 1
552 elif child["op"] == "delete":
553 symbols_removed += 1
554 elif child["op"] in ("replace", "move"):
555 symbols_modified += 1
556 elif op["op"] == "replace" and "::" not in op["address"]:
557 # File-level replace with no symbol breakdown.
558 pass
559
560 parts: list[str] = []
561 file_parts: list[str] = []
562 if files_added:
563 file_parts.append(f"{files_added} added")
564 if files_removed:
565 file_parts.append(f"{files_removed} removed")
566 if files_modified:
567 file_parts.append(f"{files_modified} modified")
568 if file_parts:
569 parts.append(f"{', '.join(file_parts)} file{'s' if sum([files_added, files_removed, files_modified]) != 1 else ''}")
570
571 sym_parts: list[str] = []
572 if symbols_added:
573 sym_parts.append(f"{symbols_added} added")
574 if symbols_removed:
575 sym_parts.append(f"{symbols_removed} removed")
576 if symbols_modified:
577 sym_parts.append(f"{symbols_modified} modified")
578 if sym_parts:
579 parts.append(f"{', '.join(sym_parts)} symbol{'s' if sum([symbols_added, symbols_removed, symbols_modified]) != 1 else ''}")
580
581 return ", ".join(parts) if parts else "no changes"