symbol_diff.py
python
| 1 | """Symbol-level diff engine for the code domain plugin. |
| 2 | |
| 3 | Produces typed :class:`~muse.domain.DomainOp` entries at symbol granularity |
| 4 | rather than file granularity. This allows Muse to report which *functions* |
| 5 | were added, removed, renamed, or modified — not just which *files* changed. |
| 6 | |
| 7 | Operation types produced |
| 8 | ------------------------ |
| 9 | ``InsertOp`` |
| 10 | A symbol was added to the file (new function, class, etc.). |
| 11 | |
| 12 | ``DeleteOp`` |
| 13 | A symbol was removed from the file. |
| 14 | |
| 15 | ``ReplaceOp`` |
| 16 | A symbol's content changed. The ``old_summary`` / ``new_summary`` fields |
| 17 | describe the nature of the change: |
| 18 | |
| 19 | - ``"renamed to <name>"`` — same body, different name (rename detected |
| 20 | via matching ``body_hash``). |
| 21 | - ``"signature changed"`` — same body, different signature. |
| 22 | - ``"implementation changed"`` — same signature, different body. |
| 23 | - ``"modified"`` — both signature and body changed. |
| 24 | |
| 25 | ``MoveOp`` |
| 26 | Reserved for intra-file positional moves (used when a symbol's address |
| 27 | is unchanged but it is detected elsewhere via ``content_id``). |
| 28 | |
| 29 | Cross-file move detection |
| 30 | ------------------------- |
| 31 | When a symbol disappears from one file and appears in another with an |
| 32 | identical ``content_id``, the diff engine annotates the ``DeleteOp`` and |
| 33 | ``InsertOp`` ``content_summary`` fields to indicate the move direction. No |
| 34 | special op type is introduced — the existing :class:`~muse.domain.InsertOp` |
| 35 | and :class:`~muse.domain.DeleteOp` suffice because their addresses already |
| 36 | encode the file path. |
| 37 | |
| 38 | Algorithm |
| 39 | --------- |
| 40 | 1. Partition symbol addresses into ``added``, ``removed``, and ``common``. |
| 41 | 2. Build ``body_hash → address`` reverse maps for added and removed sets. |
| 42 | 3. For each ``removed`` symbol: |
| 43 | |
| 44 | a. If ``content_id`` matches an ``added`` symbol → **exact move/copy** |
| 45 | (same name, same body, different file). |
| 46 | b. Else if ``body_hash`` matches an ``added`` symbol → **rename** |
| 47 | (same body, different name). |
| 48 | |
| 49 | 4. Emit ``ReplaceOp`` for renames; pair the cross-file move partners via |
| 50 | ``content_summary``. |
| 51 | 5. Emit ``DeleteOp`` for genuinely removed symbols. |
| 52 | 6. Emit ``InsertOp`` for genuinely added symbols. |
| 53 | 7. Emit ``ReplaceOp`` for symbols whose ``content_id`` changed. |
| 54 | """ |
| 55 | from __future__ import annotations |
| 56 | |
| 57 | import logging |
| 58 | |
| 59 | from muse.domain import DeleteOp, DomainOp, InsertOp, PatchOp, ReplaceOp |
| 60 | from muse.plugins.code.ast_parser import SymbolRecord, SymbolTree |
| 61 | |
| 62 | logger = logging.getLogger(__name__) |
| 63 | |
| 64 | _CHILD_DOMAIN = "code_symbols" |
| 65 | |
| 66 | |
| 67 | # --------------------------------------------------------------------------- |
| 68 | # Symbol-level diff within a single file |
| 69 | # --------------------------------------------------------------------------- |
| 70 | |
| 71 | |
| 72 | def diff_symbol_trees( |
| 73 | base: SymbolTree, |
| 74 | target: SymbolTree, |
| 75 | ) -> list[DomainOp]: |
| 76 | """Compute symbol-level ops transforming *base* into *target*. |
| 77 | |
| 78 | Both trees must be scoped to the same file (their addresses share the |
| 79 | same ``"<file_path>::"`` prefix). |
| 80 | |
| 81 | Args: |
| 82 | base: Symbol tree of the base (older) version of the file. |
| 83 | target: Symbol tree of the target (newer) version of the file. |
| 84 | |
| 85 | Returns: |
| 86 | Ordered list of :class:`~muse.domain.DomainOp` entries. |
| 87 | """ |
| 88 | base_addrs = set(base) |
| 89 | target_addrs = set(target) |
| 90 | added: set[str] = target_addrs - base_addrs |
| 91 | removed: set[str] = base_addrs - target_addrs |
| 92 | common: set[str] = base_addrs & target_addrs |
| 93 | |
| 94 | # Reverse maps for rename / move detection. |
| 95 | added_by_content: dict[str, str] = { |
| 96 | target[a]["content_id"]: a for a in added |
| 97 | } |
| 98 | added_by_body: dict[str, str] = { |
| 99 | target[a]["body_hash"]: a for a in added |
| 100 | } |
| 101 | |
| 102 | ops: list[DomainOp] = [] |
| 103 | # Addresses claimed by rename / move detection — excluded from plain ops. |
| 104 | matched_removed: set[str] = set() |
| 105 | matched_added: set[str] = set() |
| 106 | |
| 107 | # ── Pass 1: renames (same body_hash, different name) ────────────────── |
| 108 | for rem_addr in sorted(removed): |
| 109 | base_rec = base[rem_addr] |
| 110 | |
| 111 | if base_rec["content_id"] in added_by_content: |
| 112 | # Exact content match at a different address → same symbol moved |
| 113 | # (intra-file positional moves don't produce a different address; |
| 114 | # this catches cross-file moves surfaced within a single-file diff |
| 115 | # when the caller slices the tree incorrectly — uncommon). |
| 116 | tgt_addr = added_by_content[base_rec["content_id"]] |
| 117 | tgt_rec = target[tgt_addr] |
| 118 | ops.append(ReplaceOp( |
| 119 | op="replace", |
| 120 | address=rem_addr, |
| 121 | position=None, |
| 122 | old_content_id=base_rec["content_id"], |
| 123 | new_content_id=tgt_rec["content_id"], |
| 124 | old_summary=f"{base_rec['kind']} {base_rec['name']}", |
| 125 | new_summary=f"moved to {tgt_rec['qualified_name']}", |
| 126 | )) |
| 127 | matched_removed.add(rem_addr) |
| 128 | matched_added.add(tgt_addr) |
| 129 | |
| 130 | elif ( |
| 131 | base_rec["body_hash"] in added_by_body |
| 132 | and added_by_body[base_rec["body_hash"]] not in matched_added |
| 133 | ): |
| 134 | # Same body, different name → rename. |
| 135 | tgt_addr = added_by_body[base_rec["body_hash"]] |
| 136 | tgt_rec = target[tgt_addr] |
| 137 | ops.append(ReplaceOp( |
| 138 | op="replace", |
| 139 | address=rem_addr, |
| 140 | position=None, |
| 141 | old_content_id=base_rec["content_id"], |
| 142 | new_content_id=tgt_rec["content_id"], |
| 143 | old_summary=f"{base_rec['kind']} {base_rec['name']}", |
| 144 | new_summary=f"renamed to {tgt_rec['name']}", |
| 145 | )) |
| 146 | matched_removed.add(rem_addr) |
| 147 | matched_added.add(tgt_addr) |
| 148 | |
| 149 | # ── Pass 2: plain deletions ──────────────────────────────────────────── |
| 150 | for rem_addr in sorted(removed - matched_removed): |
| 151 | rec = base[rem_addr] |
| 152 | ops.append(DeleteOp( |
| 153 | op="delete", |
| 154 | address=rem_addr, |
| 155 | position=None, |
| 156 | content_id=rec["content_id"], |
| 157 | content_summary=f"removed {rec['kind']} {rec['name']}", |
| 158 | )) |
| 159 | |
| 160 | # ── Pass 3: plain additions ──────────────────────────────────────────── |
| 161 | for add_addr in sorted(added - matched_added): |
| 162 | rec = target[add_addr] |
| 163 | ops.append(InsertOp( |
| 164 | op="insert", |
| 165 | address=add_addr, |
| 166 | position=None, |
| 167 | content_id=rec["content_id"], |
| 168 | content_summary=f"added {rec['kind']} {rec['name']}", |
| 169 | )) |
| 170 | |
| 171 | # ── Pass 4: modifications ────────────────────────────────────────────── |
| 172 | for addr in sorted(common): |
| 173 | base_rec = base[addr] |
| 174 | tgt_rec = target[addr] |
| 175 | if base_rec["content_id"] == tgt_rec["content_id"]: |
| 176 | continue # unchanged |
| 177 | |
| 178 | if base_rec["body_hash"] == tgt_rec["body_hash"]: |
| 179 | # Body unchanged — signature changed (type annotations, defaults…). |
| 180 | old_summary = f"{base_rec['kind']} {base_rec['name']} (signature changed)" |
| 181 | new_summary = f"{tgt_rec['kind']} {tgt_rec['name']} (signature updated)" |
| 182 | elif base_rec["signature_id"] == tgt_rec["signature_id"]: |
| 183 | # Signature unchanged — implementation changed. |
| 184 | old_summary = f"{base_rec['kind']} {base_rec['name']} (implementation)" |
| 185 | new_summary = f"{tgt_rec['kind']} {tgt_rec['name']} (implementation changed)" |
| 186 | else: |
| 187 | # Both signature and body changed. |
| 188 | old_summary = f"{base_rec['kind']} {base_rec['name']}" |
| 189 | new_summary = f"{tgt_rec['kind']} {tgt_rec['name']} (modified)" |
| 190 | |
| 191 | ops.append(ReplaceOp( |
| 192 | op="replace", |
| 193 | address=addr, |
| 194 | position=None, |
| 195 | old_content_id=base_rec["content_id"], |
| 196 | new_content_id=tgt_rec["content_id"], |
| 197 | old_summary=old_summary, |
| 198 | new_summary=new_summary, |
| 199 | )) |
| 200 | |
| 201 | return ops |
| 202 | |
| 203 | |
| 204 | # --------------------------------------------------------------------------- |
| 205 | # Cross-file diff: build the full op list for a snapshot pair |
| 206 | # --------------------------------------------------------------------------- |
| 207 | |
| 208 | |
| 209 | def build_diff_ops( |
| 210 | base_files: dict[str, str], |
| 211 | target_files: dict[str, str], |
| 212 | base_trees: dict[str, SymbolTree], |
| 213 | target_trees: dict[str, SymbolTree], |
| 214 | ) -> list[DomainOp]: |
| 215 | """Build the complete op list transforming *base* snapshot into *target*. |
| 216 | |
| 217 | For each changed file: |
| 218 | |
| 219 | - **No symbol trees available**: coarse ``InsertOp`` / ``DeleteOp`` / |
| 220 | ``ReplaceOp`` at file level. |
| 221 | - **Symbol trees available for both sides**: ``PatchOp`` with symbol-level |
| 222 | ``child_ops``. If all symbols are unchanged (formatting-only change) |
| 223 | a ``ReplaceOp`` with ``"reformatted"`` summary is emitted instead. |
| 224 | - **Symbol tree available for one side only** (new or deleted file): |
| 225 | ``PatchOp`` listing each symbol individually. |
| 226 | |
| 227 | Cross-file move annotation |
| 228 | -------------------------- |
| 229 | After building per-file ops, a second pass checks whether any symbol |
| 230 | ``content_id`` appears in both a ``DeleteOp`` child op and an ``InsertOp`` |
| 231 | child op across *different* files. When found, both ops' ``content_summary`` |
| 232 | fields are annotated with the move direction. |
| 233 | |
| 234 | Args: |
| 235 | base_files: ``{path: raw_bytes_hash}`` from the base snapshot. |
| 236 | target_files: ``{path: raw_bytes_hash}`` from the target snapshot. |
| 237 | base_trees: Symbol trees for changed base files, keyed by path. |
| 238 | target_trees: Symbol trees for changed target files, keyed by path. |
| 239 | |
| 240 | Returns: |
| 241 | Ordered list of ``DomainOp`` entries. |
| 242 | """ |
| 243 | base_paths = set(base_files) |
| 244 | target_paths = set(target_files) |
| 245 | added_paths = sorted(target_paths - base_paths) |
| 246 | removed_paths = sorted(base_paths - target_paths) |
| 247 | modified_paths = sorted( |
| 248 | p for p in base_paths & target_paths |
| 249 | if base_files[p] != target_files[p] |
| 250 | ) |
| 251 | |
| 252 | ops: list[DomainOp] = [] |
| 253 | |
| 254 | # ── Added files ──────────────────────────────────────────────────────── |
| 255 | for path in added_paths: |
| 256 | tree = target_trees.get(path, {}) |
| 257 | if tree: |
| 258 | child_ops: list[DomainOp] = [ |
| 259 | InsertOp( |
| 260 | op="insert", |
| 261 | address=addr, |
| 262 | position=None, |
| 263 | content_id=rec["content_id"], |
| 264 | content_summary=f"added {rec['kind']} {rec['name']}", |
| 265 | ) |
| 266 | for addr, rec in sorted(tree.items()) |
| 267 | ] |
| 268 | ops.append(_patch(path, child_ops)) |
| 269 | else: |
| 270 | ops.append(InsertOp( |
| 271 | op="insert", |
| 272 | address=path, |
| 273 | position=None, |
| 274 | content_id=target_files[path], |
| 275 | content_summary=f"added {path}", |
| 276 | )) |
| 277 | |
| 278 | # ── Removed files ────────────────────────────────────────────────────── |
| 279 | for path in removed_paths: |
| 280 | tree = base_trees.get(path, {}) |
| 281 | if tree: |
| 282 | child_ops = [ |
| 283 | DeleteOp( |
| 284 | op="delete", |
| 285 | address=addr, |
| 286 | position=None, |
| 287 | content_id=rec["content_id"], |
| 288 | content_summary=f"removed {rec['kind']} {rec['name']}", |
| 289 | ) |
| 290 | for addr, rec in sorted(tree.items()) |
| 291 | ] |
| 292 | ops.append(_patch(path, child_ops)) |
| 293 | else: |
| 294 | ops.append(DeleteOp( |
| 295 | op="delete", |
| 296 | address=path, |
| 297 | position=None, |
| 298 | content_id=base_files[path], |
| 299 | content_summary=f"removed {path}", |
| 300 | )) |
| 301 | |
| 302 | # ── Modified files ───────────────────────────────────────────────────── |
| 303 | for path in modified_paths: |
| 304 | base_tree = base_trees.get(path, {}) |
| 305 | target_tree = target_trees.get(path, {}) |
| 306 | |
| 307 | if base_tree or target_tree: |
| 308 | child_ops = diff_symbol_trees(base_tree, target_tree) |
| 309 | if child_ops: |
| 310 | ops.append(_patch(path, child_ops)) |
| 311 | else: |
| 312 | # All symbols have the same content_id — formatting-only change. |
| 313 | ops.append(ReplaceOp( |
| 314 | op="replace", |
| 315 | address=path, |
| 316 | position=None, |
| 317 | old_content_id=base_files[path], |
| 318 | new_content_id=target_files[path], |
| 319 | old_summary=f"{path} (before)", |
| 320 | new_summary=f"{path} (reformatted — no semantic change)", |
| 321 | )) |
| 322 | else: |
| 323 | ops.append(ReplaceOp( |
| 324 | op="replace", |
| 325 | address=path, |
| 326 | position=None, |
| 327 | old_content_id=base_files[path], |
| 328 | new_content_id=target_files[path], |
| 329 | old_summary=f"{path} (before)", |
| 330 | new_summary=f"{path} (after)", |
| 331 | )) |
| 332 | |
| 333 | _annotate_cross_file_moves(ops) |
| 334 | return ops |
| 335 | |
| 336 | |
| 337 | def _patch(path: str, child_ops: list[DomainOp]) -> PatchOp: |
| 338 | """Wrap symbol child_ops in a file-level PatchOp.""" |
| 339 | n_added = sum(1 for o in child_ops if o["op"] == "insert") |
| 340 | n_removed = sum(1 for o in child_ops if o["op"] == "delete") |
| 341 | n_modified = sum(1 for o in child_ops if o["op"] == "replace") |
| 342 | parts: list[str] = [] |
| 343 | if n_added: |
| 344 | parts.append(f"{n_added} symbol{'s' if n_added > 1 else ''} added") |
| 345 | if n_removed: |
| 346 | parts.append(f"{n_removed} symbol{'s' if n_removed > 1 else ''} removed") |
| 347 | if n_modified: |
| 348 | parts.append(f"{n_modified} symbol{'s' if n_modified > 1 else ''} modified") |
| 349 | summary = ", ".join(parts) if parts else "no symbol changes" |
| 350 | return PatchOp( |
| 351 | op="patch", |
| 352 | address=path, |
| 353 | child_ops=child_ops, |
| 354 | child_domain=_CHILD_DOMAIN, |
| 355 | child_summary=summary, |
| 356 | ) |
| 357 | |
| 358 | |
| 359 | def _annotate_cross_file_moves(ops: list[DomainOp]) -> None: |
| 360 | """Annotate DeleteOp/InsertOp pairs that represent cross-file symbol moves. |
| 361 | |
| 362 | Mutates the ``content_summary`` of matching ops in place. A move is |
| 363 | detected when: |
| 364 | |
| 365 | - A ``DeleteOp`` child op (inside a ``PatchOp``) has the same |
| 366 | ``content_id`` as an ``InsertOp`` child op in a *different* file's |
| 367 | ``PatchOp``. |
| 368 | |
| 369 | This is a best-effort annotation pass — it does not change the semantic |
| 370 | meaning of the ops, only their human-readable summaries. |
| 371 | """ |
| 372 | # Collect child op references: content_id → (file_path, op_index_in_patch) |
| 373 | # We need mutable access so work with lists rather than immutable tuples. |
| 374 | delete_by_content: dict[str, tuple[str, int, list[DomainOp]]] = {} |
| 375 | insert_by_content: dict[str, tuple[str, int, list[DomainOp]]] = {} |
| 376 | |
| 377 | for op in ops: |
| 378 | if op["op"] != "patch": |
| 379 | continue |
| 380 | file_path = op["address"] |
| 381 | for i, child in enumerate(op["child_ops"]): |
| 382 | if child["op"] == "delete": |
| 383 | delete_by_content[child["content_id"]] = (file_path, i, op["child_ops"]) |
| 384 | elif child["op"] == "insert": |
| 385 | insert_by_content[child["content_id"]] = (file_path, i, op["child_ops"]) |
| 386 | |
| 387 | for content_id, (del_file, del_idx, del_children) in delete_by_content.items(): |
| 388 | if content_id not in insert_by_content: |
| 389 | continue |
| 390 | ins_file, ins_idx, ins_children = insert_by_content[content_id] |
| 391 | if del_file == ins_file: |
| 392 | continue # Same file — not a cross-file move. |
| 393 | |
| 394 | del_op = del_children[del_idx] |
| 395 | ins_op = ins_children[ins_idx] |
| 396 | # Narrow to the expected op kinds before accessing kind-specific fields. |
| 397 | if del_op["op"] != "delete" or ins_op["op"] != "insert": |
| 398 | continue |
| 399 | # Annotate both sides with move direction. |
| 400 | del_children[del_idx] = DeleteOp( |
| 401 | op="delete", |
| 402 | address=del_op["address"], |
| 403 | position=del_op["position"], |
| 404 | content_id=del_op["content_id"], |
| 405 | content_summary=f"{del_op['content_summary']} → moved to {ins_file}", |
| 406 | ) |
| 407 | ins_children[ins_idx] = InsertOp( |
| 408 | op="insert", |
| 409 | address=ins_op["address"], |
| 410 | position=ins_op["position"], |
| 411 | content_id=ins_op["content_id"], |
| 412 | content_summary=f"{ins_op['content_summary']} ← moved from {del_file}", |
| 413 | ) |
| 414 | |
| 415 | |
| 416 | # --------------------------------------------------------------------------- |
| 417 | # Summary helpers |
| 418 | # --------------------------------------------------------------------------- |
| 419 | |
| 420 | |
| 421 | def delta_summary(ops: list[DomainOp]) -> str: |
| 422 | """Produce a human-readable one-line summary of a list of ops. |
| 423 | |
| 424 | Counts file-level and symbol-level operations separately. |
| 425 | |
| 426 | Args: |
| 427 | ops: Top-level op list from :func:`build_diff_ops`. |
| 428 | |
| 429 | Returns: |
| 430 | A concise summary string (e.g. ``"2 files modified (5 symbols)"``) |
| 431 | or ``"no changes"`` for an empty list. |
| 432 | """ |
| 433 | files_added = sum(1 for o in ops if o["op"] == "insert" and "::" not in o["address"]) |
| 434 | files_removed = sum(1 for o in ops if o["op"] == "delete" and "::" not in o["address"]) |
| 435 | files_modified = sum(1 for o in ops if o["op"] in ("replace", "patch") and "::" not in o["address"]) |
| 436 | |
| 437 | # Count child-level symbol ops. |
| 438 | symbols_added = 0 |
| 439 | symbols_removed = 0 |
| 440 | symbols_modified = 0 |
| 441 | for op in ops: |
| 442 | if op["op"] == "patch": |
| 443 | for child in op["child_ops"]: |
| 444 | if child["op"] == "insert": |
| 445 | symbols_added += 1 |
| 446 | elif child["op"] == "delete": |
| 447 | symbols_removed += 1 |
| 448 | elif child["op"] in ("replace", "move"): |
| 449 | symbols_modified += 1 |
| 450 | elif op["op"] == "replace" and "::" not in op["address"]: |
| 451 | # File-level replace with no symbol breakdown. |
| 452 | pass |
| 453 | |
| 454 | parts: list[str] = [] |
| 455 | file_parts: list[str] = [] |
| 456 | if files_added: |
| 457 | file_parts.append(f"{files_added} added") |
| 458 | if files_removed: |
| 459 | file_parts.append(f"{files_removed} removed") |
| 460 | if files_modified: |
| 461 | file_parts.append(f"{files_modified} modified") |
| 462 | if file_parts: |
| 463 | parts.append(f"{', '.join(file_parts)} file{'s' if sum([files_added, files_removed, files_modified]) != 1 else ''}") |
| 464 | |
| 465 | sym_parts: list[str] = [] |
| 466 | if symbols_added: |
| 467 | sym_parts.append(f"{symbols_added} added") |
| 468 | if symbols_removed: |
| 469 | sym_parts.append(f"{symbols_removed} removed") |
| 470 | if symbols_modified: |
| 471 | sym_parts.append(f"{symbols_modified} modified") |
| 472 | if sym_parts: |
| 473 | parts.append(f"{', '.join(sym_parts)} symbol{'s' if sum([symbols_added, symbols_removed, symbols_modified]) != 1 else ''}") |
| 474 | |
| 475 | return ", ".join(parts) if parts else "no changes" |