cgcardona / muse public
symbol_diff.py python
475 lines 18.8 KB
062ae392 feat: code-domain semantic commands + code tour de force demo (#54) Gabriel Cardona <cgcardona@gmail.com> 1d ago
1 """Symbol-level diff engine for the code domain plugin.
2
3 Produces typed :class:`~muse.domain.DomainOp` entries at symbol granularity
4 rather than file granularity. This allows Muse to report which *functions*
5 were added, removed, renamed, or modified — not just which *files* changed.
6
7 Operation types produced
8 ------------------------
9 ``InsertOp``
10 A symbol was added to the file (new function, class, etc.).
11
12 ``DeleteOp``
13 A symbol was removed from the file.
14
15 ``ReplaceOp``
16 A symbol's content changed. The ``old_summary`` / ``new_summary`` fields
17 describe the nature of the change:
18
19 - ``"renamed to <name>"`` — same body, different name (rename detected
20 via matching ``body_hash``).
21 - ``"signature changed"`` — same body, different signature.
22 - ``"implementation changed"`` — same signature, different body.
23 - ``"modified"`` — both signature and body changed.
24
25 ``MoveOp``
26 Reserved for intra-file positional moves (used when a symbol's address
27 is unchanged but it is detected elsewhere via ``content_id``).
28
29 Cross-file move detection
30 -------------------------
31 When a symbol disappears from one file and appears in another with an
32 identical ``content_id``, the diff engine annotates the ``DeleteOp`` and
33 ``InsertOp`` ``content_summary`` fields to indicate the move direction. No
34 special op type is introduced — the existing :class:`~muse.domain.InsertOp`
35 and :class:`~muse.domain.DeleteOp` suffice because their addresses already
36 encode the file path.
37
38 Algorithm
39 ---------
40 1. Partition symbol addresses into ``added``, ``removed``, and ``common``.
41 2. Build ``body_hash → address`` reverse maps for added and removed sets.
42 3. For each ``removed`` symbol:
43
44 a. If ``content_id`` matches an ``added`` symbol → **exact move/copy**
45 (same name, same body, different file).
46 b. Else if ``body_hash`` matches an ``added`` symbol → **rename**
47 (same body, different name).
48
49 4. Emit ``ReplaceOp`` for renames; pair the cross-file move partners via
50 ``content_summary``.
51 5. Emit ``DeleteOp`` for genuinely removed symbols.
52 6. Emit ``InsertOp`` for genuinely added symbols.
53 7. Emit ``ReplaceOp`` for symbols whose ``content_id`` changed.
54 """
55 from __future__ import annotations
56
57 import logging
58
59 from muse.domain import DeleteOp, DomainOp, InsertOp, PatchOp, ReplaceOp
60 from muse.plugins.code.ast_parser import SymbolRecord, SymbolTree
61
62 logger = logging.getLogger(__name__)
63
64 _CHILD_DOMAIN = "code_symbols"
65
66
67 # ---------------------------------------------------------------------------
68 # Symbol-level diff within a single file
69 # ---------------------------------------------------------------------------
70
71
72 def diff_symbol_trees(
73 base: SymbolTree,
74 target: SymbolTree,
75 ) -> list[DomainOp]:
76 """Compute symbol-level ops transforming *base* into *target*.
77
78 Both trees must be scoped to the same file (their addresses share the
79 same ``"<file_path>::"`` prefix).
80
81 Args:
82 base: Symbol tree of the base (older) version of the file.
83 target: Symbol tree of the target (newer) version of the file.
84
85 Returns:
86 Ordered list of :class:`~muse.domain.DomainOp` entries.
87 """
88 base_addrs = set(base)
89 target_addrs = set(target)
90 added: set[str] = target_addrs - base_addrs
91 removed: set[str] = base_addrs - target_addrs
92 common: set[str] = base_addrs & target_addrs
93
94 # Reverse maps for rename / move detection.
95 added_by_content: dict[str, str] = {
96 target[a]["content_id"]: a for a in added
97 }
98 added_by_body: dict[str, str] = {
99 target[a]["body_hash"]: a for a in added
100 }
101
102 ops: list[DomainOp] = []
103 # Addresses claimed by rename / move detection — excluded from plain ops.
104 matched_removed: set[str] = set()
105 matched_added: set[str] = set()
106
107 # ── Pass 1: renames (same body_hash, different name) ──────────────────
108 for rem_addr in sorted(removed):
109 base_rec = base[rem_addr]
110
111 if base_rec["content_id"] in added_by_content:
112 # Exact content match at a different address → same symbol moved
113 # (intra-file positional moves don't produce a different address;
114 # this catches cross-file moves surfaced within a single-file diff
115 # when the caller slices the tree incorrectly — uncommon).
116 tgt_addr = added_by_content[base_rec["content_id"]]
117 tgt_rec = target[tgt_addr]
118 ops.append(ReplaceOp(
119 op="replace",
120 address=rem_addr,
121 position=None,
122 old_content_id=base_rec["content_id"],
123 new_content_id=tgt_rec["content_id"],
124 old_summary=f"{base_rec['kind']} {base_rec['name']}",
125 new_summary=f"moved to {tgt_rec['qualified_name']}",
126 ))
127 matched_removed.add(rem_addr)
128 matched_added.add(tgt_addr)
129
130 elif (
131 base_rec["body_hash"] in added_by_body
132 and added_by_body[base_rec["body_hash"]] not in matched_added
133 ):
134 # Same body, different name → rename.
135 tgt_addr = added_by_body[base_rec["body_hash"]]
136 tgt_rec = target[tgt_addr]
137 ops.append(ReplaceOp(
138 op="replace",
139 address=rem_addr,
140 position=None,
141 old_content_id=base_rec["content_id"],
142 new_content_id=tgt_rec["content_id"],
143 old_summary=f"{base_rec['kind']} {base_rec['name']}",
144 new_summary=f"renamed to {tgt_rec['name']}",
145 ))
146 matched_removed.add(rem_addr)
147 matched_added.add(tgt_addr)
148
149 # ── Pass 2: plain deletions ────────────────────────────────────────────
150 for rem_addr in sorted(removed - matched_removed):
151 rec = base[rem_addr]
152 ops.append(DeleteOp(
153 op="delete",
154 address=rem_addr,
155 position=None,
156 content_id=rec["content_id"],
157 content_summary=f"removed {rec['kind']} {rec['name']}",
158 ))
159
160 # ── Pass 3: plain additions ────────────────────────────────────────────
161 for add_addr in sorted(added - matched_added):
162 rec = target[add_addr]
163 ops.append(InsertOp(
164 op="insert",
165 address=add_addr,
166 position=None,
167 content_id=rec["content_id"],
168 content_summary=f"added {rec['kind']} {rec['name']}",
169 ))
170
171 # ── Pass 4: modifications ──────────────────────────────────────────────
172 for addr in sorted(common):
173 base_rec = base[addr]
174 tgt_rec = target[addr]
175 if base_rec["content_id"] == tgt_rec["content_id"]:
176 continue # unchanged
177
178 if base_rec["body_hash"] == tgt_rec["body_hash"]:
179 # Body unchanged — signature changed (type annotations, defaults…).
180 old_summary = f"{base_rec['kind']} {base_rec['name']} (signature changed)"
181 new_summary = f"{tgt_rec['kind']} {tgt_rec['name']} (signature updated)"
182 elif base_rec["signature_id"] == tgt_rec["signature_id"]:
183 # Signature unchanged — implementation changed.
184 old_summary = f"{base_rec['kind']} {base_rec['name']} (implementation)"
185 new_summary = f"{tgt_rec['kind']} {tgt_rec['name']} (implementation changed)"
186 else:
187 # Both signature and body changed.
188 old_summary = f"{base_rec['kind']} {base_rec['name']}"
189 new_summary = f"{tgt_rec['kind']} {tgt_rec['name']} (modified)"
190
191 ops.append(ReplaceOp(
192 op="replace",
193 address=addr,
194 position=None,
195 old_content_id=base_rec["content_id"],
196 new_content_id=tgt_rec["content_id"],
197 old_summary=old_summary,
198 new_summary=new_summary,
199 ))
200
201 return ops
202
203
204 # ---------------------------------------------------------------------------
205 # Cross-file diff: build the full op list for a snapshot pair
206 # ---------------------------------------------------------------------------
207
208
209 def build_diff_ops(
210 base_files: dict[str, str],
211 target_files: dict[str, str],
212 base_trees: dict[str, SymbolTree],
213 target_trees: dict[str, SymbolTree],
214 ) -> list[DomainOp]:
215 """Build the complete op list transforming *base* snapshot into *target*.
216
217 For each changed file:
218
219 - **No symbol trees available**: coarse ``InsertOp`` / ``DeleteOp`` /
220 ``ReplaceOp`` at file level.
221 - **Symbol trees available for both sides**: ``PatchOp`` with symbol-level
222 ``child_ops``. If all symbols are unchanged (formatting-only change)
223 a ``ReplaceOp`` with ``"reformatted"`` summary is emitted instead.
224 - **Symbol tree available for one side only** (new or deleted file):
225 ``PatchOp`` listing each symbol individually.
226
227 Cross-file move annotation
228 --------------------------
229 After building per-file ops, a second pass checks whether any symbol
230 ``content_id`` appears in both a ``DeleteOp`` child op and an ``InsertOp``
231 child op across *different* files. When found, both ops' ``content_summary``
232 fields are annotated with the move direction.
233
234 Args:
235 base_files: ``{path: raw_bytes_hash}`` from the base snapshot.
236 target_files: ``{path: raw_bytes_hash}`` from the target snapshot.
237 base_trees: Symbol trees for changed base files, keyed by path.
238 target_trees: Symbol trees for changed target files, keyed by path.
239
240 Returns:
241 Ordered list of ``DomainOp`` entries.
242 """
243 base_paths = set(base_files)
244 target_paths = set(target_files)
245 added_paths = sorted(target_paths - base_paths)
246 removed_paths = sorted(base_paths - target_paths)
247 modified_paths = sorted(
248 p for p in base_paths & target_paths
249 if base_files[p] != target_files[p]
250 )
251
252 ops: list[DomainOp] = []
253
254 # ── Added files ────────────────────────────────────────────────────────
255 for path in added_paths:
256 tree = target_trees.get(path, {})
257 if tree:
258 child_ops: list[DomainOp] = [
259 InsertOp(
260 op="insert",
261 address=addr,
262 position=None,
263 content_id=rec["content_id"],
264 content_summary=f"added {rec['kind']} {rec['name']}",
265 )
266 for addr, rec in sorted(tree.items())
267 ]
268 ops.append(_patch(path, child_ops))
269 else:
270 ops.append(InsertOp(
271 op="insert",
272 address=path,
273 position=None,
274 content_id=target_files[path],
275 content_summary=f"added {path}",
276 ))
277
278 # ── Removed files ──────────────────────────────────────────────────────
279 for path in removed_paths:
280 tree = base_trees.get(path, {})
281 if tree:
282 child_ops = [
283 DeleteOp(
284 op="delete",
285 address=addr,
286 position=None,
287 content_id=rec["content_id"],
288 content_summary=f"removed {rec['kind']} {rec['name']}",
289 )
290 for addr, rec in sorted(tree.items())
291 ]
292 ops.append(_patch(path, child_ops))
293 else:
294 ops.append(DeleteOp(
295 op="delete",
296 address=path,
297 position=None,
298 content_id=base_files[path],
299 content_summary=f"removed {path}",
300 ))
301
302 # ── Modified files ─────────────────────────────────────────────────────
303 for path in modified_paths:
304 base_tree = base_trees.get(path, {})
305 target_tree = target_trees.get(path, {})
306
307 if base_tree or target_tree:
308 child_ops = diff_symbol_trees(base_tree, target_tree)
309 if child_ops:
310 ops.append(_patch(path, child_ops))
311 else:
312 # All symbols have the same content_id — formatting-only change.
313 ops.append(ReplaceOp(
314 op="replace",
315 address=path,
316 position=None,
317 old_content_id=base_files[path],
318 new_content_id=target_files[path],
319 old_summary=f"{path} (before)",
320 new_summary=f"{path} (reformatted — no semantic change)",
321 ))
322 else:
323 ops.append(ReplaceOp(
324 op="replace",
325 address=path,
326 position=None,
327 old_content_id=base_files[path],
328 new_content_id=target_files[path],
329 old_summary=f"{path} (before)",
330 new_summary=f"{path} (after)",
331 ))
332
333 _annotate_cross_file_moves(ops)
334 return ops
335
336
337 def _patch(path: str, child_ops: list[DomainOp]) -> PatchOp:
338 """Wrap symbol child_ops in a file-level PatchOp."""
339 n_added = sum(1 for o in child_ops if o["op"] == "insert")
340 n_removed = sum(1 for o in child_ops if o["op"] == "delete")
341 n_modified = sum(1 for o in child_ops if o["op"] == "replace")
342 parts: list[str] = []
343 if n_added:
344 parts.append(f"{n_added} symbol{'s' if n_added > 1 else ''} added")
345 if n_removed:
346 parts.append(f"{n_removed} symbol{'s' if n_removed > 1 else ''} removed")
347 if n_modified:
348 parts.append(f"{n_modified} symbol{'s' if n_modified > 1 else ''} modified")
349 summary = ", ".join(parts) if parts else "no symbol changes"
350 return PatchOp(
351 op="patch",
352 address=path,
353 child_ops=child_ops,
354 child_domain=_CHILD_DOMAIN,
355 child_summary=summary,
356 )
357
358
359 def _annotate_cross_file_moves(ops: list[DomainOp]) -> None:
360 """Annotate DeleteOp/InsertOp pairs that represent cross-file symbol moves.
361
362 Mutates the ``content_summary`` of matching ops in place. A move is
363 detected when:
364
365 - A ``DeleteOp`` child op (inside a ``PatchOp``) has the same
366 ``content_id`` as an ``InsertOp`` child op in a *different* file's
367 ``PatchOp``.
368
369 This is a best-effort annotation pass — it does not change the semantic
370 meaning of the ops, only their human-readable summaries.
371 """
372 # Collect child op references: content_id → (file_path, op_index_in_patch)
373 # We need mutable access so work with lists rather than immutable tuples.
374 delete_by_content: dict[str, tuple[str, int, list[DomainOp]]] = {}
375 insert_by_content: dict[str, tuple[str, int, list[DomainOp]]] = {}
376
377 for op in ops:
378 if op["op"] != "patch":
379 continue
380 file_path = op["address"]
381 for i, child in enumerate(op["child_ops"]):
382 if child["op"] == "delete":
383 delete_by_content[child["content_id"]] = (file_path, i, op["child_ops"])
384 elif child["op"] == "insert":
385 insert_by_content[child["content_id"]] = (file_path, i, op["child_ops"])
386
387 for content_id, (del_file, del_idx, del_children) in delete_by_content.items():
388 if content_id not in insert_by_content:
389 continue
390 ins_file, ins_idx, ins_children = insert_by_content[content_id]
391 if del_file == ins_file:
392 continue # Same file — not a cross-file move.
393
394 del_op = del_children[del_idx]
395 ins_op = ins_children[ins_idx]
396 # Narrow to the expected op kinds before accessing kind-specific fields.
397 if del_op["op"] != "delete" or ins_op["op"] != "insert":
398 continue
399 # Annotate both sides with move direction.
400 del_children[del_idx] = DeleteOp(
401 op="delete",
402 address=del_op["address"],
403 position=del_op["position"],
404 content_id=del_op["content_id"],
405 content_summary=f"{del_op['content_summary']} → moved to {ins_file}",
406 )
407 ins_children[ins_idx] = InsertOp(
408 op="insert",
409 address=ins_op["address"],
410 position=ins_op["position"],
411 content_id=ins_op["content_id"],
412 content_summary=f"{ins_op['content_summary']} ← moved from {del_file}",
413 )
414
415
416 # ---------------------------------------------------------------------------
417 # Summary helpers
418 # ---------------------------------------------------------------------------
419
420
421 def delta_summary(ops: list[DomainOp]) -> str:
422 """Produce a human-readable one-line summary of a list of ops.
423
424 Counts file-level and symbol-level operations separately.
425
426 Args:
427 ops: Top-level op list from :func:`build_diff_ops`.
428
429 Returns:
430 A concise summary string (e.g. ``"2 files modified (5 symbols)"``)
431 or ``"no changes"`` for an empty list.
432 """
433 files_added = sum(1 for o in ops if o["op"] == "insert" and "::" not in o["address"])
434 files_removed = sum(1 for o in ops if o["op"] == "delete" and "::" not in o["address"])
435 files_modified = sum(1 for o in ops if o["op"] in ("replace", "patch") and "::" not in o["address"])
436
437 # Count child-level symbol ops.
438 symbols_added = 0
439 symbols_removed = 0
440 symbols_modified = 0
441 for op in ops:
442 if op["op"] == "patch":
443 for child in op["child_ops"]:
444 if child["op"] == "insert":
445 symbols_added += 1
446 elif child["op"] == "delete":
447 symbols_removed += 1
448 elif child["op"] in ("replace", "move"):
449 symbols_modified += 1
450 elif op["op"] == "replace" and "::" not in op["address"]:
451 # File-level replace with no symbol breakdown.
452 pass
453
454 parts: list[str] = []
455 file_parts: list[str] = []
456 if files_added:
457 file_parts.append(f"{files_added} added")
458 if files_removed:
459 file_parts.append(f"{files_removed} removed")
460 if files_modified:
461 file_parts.append(f"{files_modified} modified")
462 if file_parts:
463 parts.append(f"{', '.join(file_parts)} file{'s' if sum([files_added, files_removed, files_modified]) != 1 else ''}")
464
465 sym_parts: list[str] = []
466 if symbols_added:
467 sym_parts.append(f"{symbols_added} added")
468 if symbols_removed:
469 sym_parts.append(f"{symbols_removed} removed")
470 if symbols_modified:
471 sym_parts.append(f"{symbols_modified} modified")
472 if sym_parts:
473 parts.append(f"{', '.join(sym_parts)} symbol{'s' if sum([symbols_added, symbols_removed, symbols_modified]) != 1 else ''}")
474
475 return ", ".join(parts) if parts else "no changes"