clones.py
python
| 1 | """muse clones — find duplicate and near-duplicate symbols. |
| 2 | |
| 3 | Detects two tiers of code duplication from committed snapshot data: |
| 4 | |
| 5 | **Exact clones** |
| 6 | Symbols with the same ``body_hash`` at different addresses. The body is |
| 7 | character-for-character identical (after normalisation) even if the name or |
| 8 | surrounding context differs. These are true copy-paste duplicates. |
| 9 | |
| 10 | **Near-clones** |
| 11 | Symbols with the same ``signature_id`` but different ``body_hash``. Same |
| 12 | function signature, different implementation — strong candidates for |
| 13 | consolidation behind a shared abstraction. |
| 14 | |
| 15 | Git has no concept of these. Git stores file diffs; Muse stores symbol |
| 16 | identity hashes. Clone detection is a single pass over the snapshot index. |
| 17 | |
| 18 | Usage:: |
| 19 | |
| 20 | muse clones |
| 21 | muse clones --tier exact |
| 22 | muse clones --tier near |
| 23 | muse clones --kind function |
| 24 | muse clones --commit HEAD~10 |
| 25 | muse clones --min-cluster 3 |
| 26 | muse clones --json |
| 27 | |
| 28 | Output:: |
| 29 | |
| 30 | Clone analysis — commit a1b2c3d4 |
| 31 | ────────────────────────────────────────────────────────────── |
| 32 | |
| 33 | Exact clones (2 cluster(s)): |
| 34 | body_hash a1b2c3d4: |
| 35 | src/billing.py::compute_hash function |
| 36 | src/utils.py::compute_hash function |
| 37 | src/legacy.py::_hash function |
| 38 | |
| 39 | Near-clones — same signature (3 cluster(s)): |
| 40 | signature_id e5f6a7b8: |
| 41 | src/billing.py::validate function (body 9c1d2e3f) |
| 42 | src/auth.py::validate function (body 4a5b6c7d) |
| 43 | |
| 44 | Flags: |
| 45 | |
| 46 | ``--tier {exact|near|both}`` |
| 47 | Which tier to report (default: both). |
| 48 | |
| 49 | ``--kind KIND`` |
| 50 | Restrict to symbols of this kind. |
| 51 | |
| 52 | ``--min-cluster N`` |
| 53 | Only show clusters with at least N members (default: 2). |
| 54 | |
| 55 | ``--commit, -c REF`` |
| 56 | Analyse a historical snapshot instead of HEAD. |
| 57 | |
| 58 | ``--json`` |
| 59 | Emit results as JSON. |
| 60 | """ |
| 61 | from __future__ import annotations |
| 62 | |
| 63 | import json |
| 64 | import logging |
| 65 | import pathlib |
| 66 | from typing import Literal |
| 67 | |
| 68 | import typer |
| 69 | |
| 70 | from muse.core.errors import ExitCode |
| 71 | from muse.core.repo import require_repo |
| 72 | from muse.core.store import get_commit_snapshot_manifest, resolve_commit_ref |
| 73 | from muse.plugins.code._query import language_of, symbols_for_snapshot |
| 74 | from muse.plugins.code.ast_parser import SymbolRecord |
| 75 | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Typer application object; the ``clones`` callback in this file is registered on it.
app = typer.Typer()

# The tier names accepted by ``find_clones`` and validated for ``--tier``.
CloneTier = Literal["exact", "near", "both"]
| 81 | |
| 82 | |
def _read_repo_id(root: pathlib.Path) -> str:
    """Return the ``repo_id`` stored in ``<root>/.muse/repo.json`` as a string.

    The file is decoded explicitly as UTF-8 (the encoding JSON mandates)
    instead of relying on the platform's locale encoding, so the result does
    not depend on the machine the command runs on.

    Args:
        root: Repository root directory containing the ``.muse`` folder.

    Returns:
        The value of the ``"repo_id"`` key, converted with ``str``.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the document has no ``"repo_id"`` key.
    """
    config_path = root / ".muse" / "repo.json"
    config = json.loads(config_path.read_text(encoding="utf-8"))
    return str(config["repo_id"])
| 85 | |
| 86 | |
def _read_branch(root: pathlib.Path) -> str:
    """Return the branch name recorded in ``<root>/.muse/HEAD``.

    The file content is stripped of surrounding whitespace, a leading
    ``refs/heads/`` prefix is removed when present, and the remainder is
    stripped once more before being returned.
    """
    head_path = root / ".muse" / "HEAD"
    raw_ref = head_path.read_text().strip()
    branch = raw_ref.removeprefix("refs/heads/")
    return branch.strip()
| 90 | |
| 91 | |
class _CloneCluster:
    """A group of symbols that were bucketed under one shared hash.

    The constructor stores its arguments unchanged:

    * ``tier`` — the detection tier that produced the cluster.
    * ``hash_value`` — the full hash the members were grouped by.
    * ``members`` — ``(address, record)`` pairs in the order supplied.
    """

    def __init__(
        self,
        tier: CloneTier,
        hash_value: str,
        members: list[tuple[str, SymbolRecord]],
    ) -> None:
        self.members = members  # (address, record)
        self.hash_value = hash_value
        self.tier = tier

    def to_dict(self) -> dict[str, str | list[dict[str, str]]]:
        """Return a JSON-friendly summary of this cluster.

        Every hash is cut down to its first eight characters, and the member
        count is rendered as a string.  The language of each member is looked
        up from the part of its address that precedes the first ``::``.
        """
        serialised: list[dict[str, str]] = []
        for address, record in self.members:
            file_part = address.split("::")[0]
            entry = {
                "address": address,
                "kind": record["kind"],
                "language": language_of(file_part),
                "body_hash": record["body_hash"][:8],
                "signature_id": record["signature_id"][:8],
                "content_id": record["content_id"][:8],
            }
            serialised.append(entry)

        short_hash = self.hash_value[:8]
        member_count = str(len(self.members))
        return {
            "tier": self.tier,
            "hash": short_hash,
            "count": member_count,
            "members": serialised,
        }
| 120 | |
| 121 | |
def find_clones(
    root: pathlib.Path,
    manifest: dict[str, str],
    tier: CloneTier,
    kind_filter: str | None,
    min_cluster: int,
) -> list[_CloneCluster]:
    """Build clone clusters from *manifest*.

    Symbols are loaded through ``symbols_for_snapshot``; records whose kind is
    ``"import"`` are ignored.  Depending on *tier* the remaining symbols are
    grouped by ``body_hash`` (exact clones) and/or by ``signature_id``
    (near-clones).  A signature group only counts as a near-clone cluster when
    its members carry at least two different body hashes.

    Args:
        root: Repository root, passed through to ``symbols_for_snapshot``.
        manifest: Snapshot manifest, passed through to ``symbols_for_snapshot``.
        tier: ``"exact"``, ``"near"`` or ``"both"``.
        kind_filter: Forwarded to ``symbols_for_snapshot`` as ``kind_filter``.
        min_cluster: Minimum number of members a cluster must have.  Values
            below 2 are treated as 2, because a single symbol cannot be a
            clone of anything.

    Returns:
        The clusters, largest first, then ordered by tier name and hash.
    """
    # Without this floor, ``min_cluster <= 1`` would turn every symbol into a
    # one-member "exact clone" cluster.  The near tier already implies two or
    # more members (it needs two distinct bodies), so this keeps both tiers
    # consistent.
    threshold = max(min_cluster, 2)

    sym_map = symbols_for_snapshot(root, manifest, kind_filter=kind_filter)

    # Flatten to a list of (address, record), in sorted file/address order so
    # that cluster membership order is deterministic.
    all_syms: list[tuple[str, SymbolRecord]] = [
        (addr, rec)
        for _fp, tree in sorted(sym_map.items())
        for addr, rec in sorted(tree.items())
        if rec["kind"] != "import"
    ]

    clusters: list[_CloneCluster] = []

    if tier in ("exact", "both"):
        body_index: dict[str, list[tuple[str, SymbolRecord]]] = {}
        for addr, rec in all_syms:
            body_index.setdefault(rec["body_hash"], []).append((addr, rec))
        for body_hash, members in sorted(body_index.items()):
            if len(members) >= threshold:
                clusters.append(_CloneCluster("exact", body_hash, members))

    if tier in ("near", "both"):
        sig_index: dict[str, list[tuple[str, SymbolRecord]]] = {}
        for addr, rec in all_syms:
            sig_index.setdefault(rec["signature_id"], []).append((addr, rec))
        for sig_id, members in sorted(sig_index.items()):
            # Near-clone: same signature, at least two DIFFERENT body hashes.
            # A group whose members all share one body is purely an exact
            # clone and is therefore not reported again in this tier.
            unique_bodies = {r["body_hash"] for _, r in members}
            if len(members) >= threshold and len(unique_bodies) > 1:
                clusters.append(_CloneCluster("near", sig_id, members))

    # Sort: largest clusters first, then by tier, then by hash.
    clusters.sort(key=lambda c: (-len(c.members), c.tier, c.hash_value))
    return clusters
| 164 | |
| 165 | |
@app.callback(invoke_without_command=True)
def clones(
    ctx: typer.Context,
    tier: str = typer.Option(
        "both", "--tier", "-t",
        help="Tier to report: exact, near, or both.",
    ),
    kind_filter: str | None = typer.Option(
        None, "--kind", "-k", metavar="KIND",
        help="Restrict to symbols of this kind.",
    ),
    min_cluster: int = typer.Option(
        2, "--min-cluster", "-m", metavar="N",
        help="Only show clusters with at least N members.",
    ),
    ref: str | None = typer.Option(
        None, "--commit", "-c", metavar="REF",
        help="Analyse this commit instead of HEAD.",
    ),
    as_json: bool = typer.Option(False, "--json", help="Emit results as JSON."),
) -> None:
    """Find exact and near-duplicate symbols in the committed snapshot.

    Exact clones share the same ``body_hash`` (identical implementation).
    Near-clones share the same ``signature_id`` but differ in body — same
    contract, different implementation. Both tiers are candidates for
    consolidation behind shared abstractions.

    Uses content-addressed hashes from the snapshot — no AST recomputation
    or file parsing at query time.
    """
    root = require_repo()
    repo_id = _read_repo_id(root)
    branch = _read_branch(root)

    # ``tier`` arrives as a free-form string from the command line; reject
    # anything that is not one of the three known tier names.
    if tier not in ("exact", "near", "both"):
        typer.echo(f"❌ --tier must be 'exact', 'near', or 'both' (got: {tier!r})", err=True)
        raise typer.Exit(code=ExitCode.USER_ERROR)

    commit = resolve_commit_ref(root, repo_id, branch, ref)
    if commit is None:
        typer.echo(f"❌ Commit '{ref or 'HEAD'}' not found.", err=True)
        raise typer.Exit(code=ExitCode.USER_ERROR)

    # A missing manifest is treated as an empty snapshot.
    manifest = get_commit_snapshot_manifest(root, commit.commit_id) or {}
    # Validated above — safe to narrow.  The branches pass literal strings so
    # the call matches the ``CloneTier`` parameter type.
    if tier == "exact":
        cluster_list = find_clones(root, manifest, "exact", kind_filter, min_cluster)
    elif tier == "near":
        cluster_list = find_clones(root, manifest, "near", kind_filter, min_cluster)
    else:
        cluster_list = find_clones(root, manifest, "both", kind_filter, min_cluster)

    exact_clusters = [c for c in cluster_list if c.tier == "exact"]
    near_clusters = [c for c in cluster_list if c.tier == "near"]

    if as_json:
        typer.echo(json.dumps(
            {
                "schema_version": 1,
                "commit": commit.commit_id[:8],
                "tier": tier,
                "min_cluster": min_cluster,
                "kind_filter": kind_filter,
                "exact_clone_clusters": len(exact_clusters),
                "near_clone_clusters": len(near_clusters),
                "clusters": [c.to_dict() for c in cluster_list],
            },
            indent=2,
        ))
        return

    typer.echo(f"\nClone analysis — commit {commit.commit_id[:8]}")
    if kind_filter:
        typer.echo(f" (kind: {kind_filter})")
    typer.echo("─" * 62)

    if not cluster_list:
        typer.echo("\n ✅ No clones detected.")
        return

    if exact_clusters and tier in ("exact", "both"):
        typer.echo(f"\nExact clones ({len(exact_clusters)} cluster(s)):")
        for cl in exact_clusters:
            typer.echo(f" body_hash {cl.hash_value[:8]}:")
            for addr, rec in cl.members:
                typer.echo(f" {addr} {rec['kind']}")

    if near_clusters and tier in ("near", "both"):
        typer.echo(f"\nNear-clones — same signature ({len(near_clusters)} cluster(s)):")
        for cl in near_clusters:
            typer.echo(f" signature_id {cl.hash_value[:8]}:")
            for addr, rec in cl.members:
                typer.echo(f" {addr} {rec['kind']} (body {rec['body_hash'][:8]})")

    # Count distinct addresses rather than summing cluster sizes: with tier
    # "both" the same symbol can sit in an exact cluster and a near cluster,
    # and summing would report it twice.
    involved = {addr for cl in cluster_list for addr, _rec in cl.members}
    total = len(involved)
    typer.echo(f"\n {len(cluster_list)} clone cluster(s), {total} total symbol(s) involved")
    typer.echo(" Consider consolidating behind shared abstractions.")