index_rebuild.py
python
| 1 | """muse index — manage and rebuild the optional local index layer. |
| 2 | |
| 3 | Indexes live under ``.muse/indices/`` and are fully derived from the commit |
| 4 | history. They are optional — all commands work without them, but indexes |
| 5 | dramatically accelerate repeated queries on large repositories. |
| 6 | |
| 7 | Available indexes |
| 8 | ----------------- |
| 9 | |
| 10 | ``symbol_history`` |
| 11 | Maps every symbol address to its full event timeline across all commits. |
| 12 | Reduces ``muse symbol-log``, ``muse lineage``, and ``muse query-history`` |
| 13 | from O(commits × files) to O(1) lookups. |
| 14 | |
| 15 | ``hash_occurrence`` |
| 16 | Maps every ``body_hash`` to the list of addresses that share it. |
| 17 | Reduces ``muse clones`` and ``muse find-symbol hash=`` to O(1). |
| 18 | |
| 19 | Sub-commands |
| 20 | ------------ |
| 21 | |
| 22 | ``muse index status`` |
| 23 | Show the status, entry count, and last-updated time of each index. |
| 24 | |
| 25 | ``muse index rebuild [--index NAME]`` |
| 26 | Rebuild one or all indexes by walking the entire commit history. |
| 27 | Safe to run multiple times. |
| 28 | |
| 29 | Usage:: |
| 30 | |
| 31 | muse index status |
| 32 | muse index status --json |
| 33 | muse index rebuild |
| 34 | muse index rebuild --json |
| 35 | muse index rebuild --index symbol_history |
| 36 | muse index rebuild --index hash_occurrence |
| 37 | |
| 38 | JSON output — ``muse index status --json``:: |
| 39 | |
| 40 | [ |
| 41 | {"name": "symbol_history", "status": "present", "entries": 1024, |
| 42 | "updated_at": "2026-03-21T12:00:00"}, |
| 43 | {"name": "hash_occurrence", "status": "absent", "entries": 0, |
| 44 | "updated_at": null} |
| 45 | ] |
| 46 | |
| 47 | JSON output — ``muse index rebuild --json``:: |
| 48 | |
| 49 | {"rebuilt": ["symbol_history", "hash_occurrence"], |
| 50 | "symbol_history_addresses": 512, "symbol_history_events": 2048, |
| 51 | "hash_occurrence_clusters": 31, "hash_occurrence_addresses": 87} |
| 52 | """ |
| 53 | |
| 54 | from __future__ import annotations |
| 55 | |
| 56 | import argparse |
| 57 | import json |
| 58 | import logging |
| 59 | import pathlib |
| 60 | import sys |
| 61 | |
| 62 | from muse.core.errors import ExitCode |
| 63 | from muse.core.indices import ( |
| 64 | HashOccurrenceIndex, |
| 65 | SymbolHistoryEntry, |
| 66 | SymbolHistoryIndex, |
| 67 | index_info, |
| 68 | load_hash_occurrence, |
| 69 | load_symbol_history, |
| 70 | save_hash_occurrence, |
| 71 | save_symbol_history, |
| 72 | ) |
| 73 | from muse.core.object_store import read_object |
| 74 | from muse.core.repo import require_repo |
| 75 | from muse.core.store import get_all_commits, get_commit_snapshot_manifest, read_current_branch |
| 76 | from muse.plugins.code._query import is_semantic |
| 77 | from muse.plugins.code.ast_parser import parse_symbols |
| 78 | |
| 79 | logger = logging.getLogger(__name__) |
| 80 | |
| 81 | |
| 82 | # --------------------------------------------------------------------------- |
| 83 | # Index build logic |
| 84 | # --------------------------------------------------------------------------- |
| 85 | |
| 86 | |
def _build_symbol_history(root: pathlib.Path) -> SymbolHistoryIndex:
    """Walk all commits oldest-first and build the symbol history index.

    For every semantic symbol address touched by a ``patch`` op, appends a
    :class:`SymbolHistoryEntry` (commit id, timestamp, op kind, content /
    body / signature hashes) to that address's timeline. Hash fields are
    recovered by re-parsing the commit's snapshot blob, since the structured
    delta does not carry them.

    Returns a mapping from symbol address to its chronological event list.
    """
    all_commits = sorted(
        get_all_commits(root),
        key=lambda c: c.committed_at,
    )
    index: SymbolHistoryIndex = {}

    for commit in all_commits:
        if commit.structured_delta is None:
            continue
        committed_at = commit.committed_at.isoformat()
        ops = commit.structured_delta.get("ops", [])

        # Per-commit caches, hoisted out of the child-op loop: the snapshot
        # manifest is invariant for the whole commit, and each file's blob
        # only needs to be read and parsed once per commit. (Previously the
        # manifest was re-fetched and the blob re-parsed for every child op.)
        manifest: dict | None = None
        tree_cache: dict[str, dict] = {}

        for op in ops:
            if op["op"] != "patch":
                continue
            for child in op.get("child_ops", []):
                addr = child["address"]
                if "::" not in addr:
                    continue
                file_path = addr.split("::")[0]
                if not is_semantic(file_path):
                    continue
                child_op = child["op"]
                if child_op not in ("insert", "delete", "replace"):
                    continue

                # Lazily fetch the manifest only once we know this commit
                # actually contains a semantic symbol event.
                if manifest is None:
                    manifest = get_commit_snapshot_manifest(root, commit.commit_id) or {}

                # Re-parse the snapshot blob (cached per file) to recover
                # body_hash / signature_id, which are not stored in the delta.
                if file_path not in tree_cache:
                    tree_cache[file_path] = {}
                    obj_id = manifest.get(file_path)
                    if obj_id:
                        raw = read_object(root, obj_id)
                        if raw:
                            tree_cache[file_path] = parse_symbols(raw, file_path)

                body_hash = ""
                signature_id = ""
                content_id = ""
                rec = tree_cache[file_path].get(addr)
                if rec:
                    body_hash = rec["body_hash"]
                    signature_id = rec["signature_id"]
                    content_id = rec["content_id"]

                if not content_id:
                    # Fall back to the delta's own content id. Replace ops
                    # name it "new_content_id"; insert/delete use "content_id".
                    # (The original tested ``child_op == X and child["op"] == X``,
                    # but those are the same value — the double check was dead.)
                    key = "new_content_id" if child_op == "replace" else "content_id"
                    content_id = child[key]

                entry = SymbolHistoryEntry(
                    commit_id=commit.commit_id,
                    committed_at=committed_at,
                    op=child_op,
                    content_id=content_id,
                    body_hash=body_hash,
                    signature_id=signature_id,
                )
                index.setdefault(addr, []).append(entry)

    return index
| 152 | |
| 153 | |
def _build_hash_occurrence(root: pathlib.Path) -> HashOccurrenceIndex:
    """Walk the HEAD snapshot and build the hash occurrence index.

    Resolves HEAD to a commit, parses every semantic file in that commit's
    snapshot, and groups symbol addresses by ``body_hash``. Hashes held by
    only one address are dropped — a clone needs at least two occurrences.
    Returns ``{}`` when the repo has no HEAD or the branch ref is missing.
    """
    muse_dir = root / ".muse"
    if not (muse_dir / "HEAD").exists():
        return {}
    branch = read_current_branch(root)
    ref_file = muse_dir / "refs" / "heads" / branch
    if not ref_file.exists():
        return {}
    head_commit = ref_file.read_text().strip()

    snapshot = get_commit_snapshot_manifest(root, head_commit) or {}
    occurrences: HashOccurrenceIndex = {}

    for path, blob_id in sorted(snapshot.items()):
        if not is_semantic(path):
            continue
        blob = read_object(root, blob_id)
        if blob is None:
            continue
        for address, record in parse_symbols(blob, path).items():
            # Imports are boilerplate; identical import lines are not clones.
            if record["kind"] == "import":
                continue
            occurrences.setdefault(record["body_hash"], []).append(address)

    # Keep only hashes shared by two or more addresses.
    return {h: members for h, members in occurrences.items() if len(members) > 1}
| 185 | |
| 186 | |
| 187 | # --------------------------------------------------------------------------- |
| 188 | # Sub-commands |
| 189 | # --------------------------------------------------------------------------- |
| 190 | |
| 191 | |
def register(subparsers: "argparse._SubParsersAction[argparse.ArgumentParser]") -> None:
    """Wire up ``muse index`` and its ``status`` / ``rebuild`` sub-commands."""
    parser = subparsers.add_parser(
        "index",
        help="Manage the optional local index layer.",
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subcommands = parser.add_subparsers(dest="subcommand", metavar="SUBCOMMAND")
    subcommands.required = True

    # --- muse index status -------------------------------------------------
    status_parser = subcommands.add_parser(
        "status",
        help="Show the status and entry count of each local index.",
    )
    status_parser.add_argument(
        "--json",
        dest="as_json",
        action="store_true",
        help="Emit index status as JSON.",
    )
    status_parser.set_defaults(func=run_status)

    # --- muse index rebuild ------------------------------------------------
    rebuild_description = (
        "Rebuilds ``symbol_history`` and/or ``hash_occurrence`` indexes under "
        "``.muse/indices/``. Safe to run multiple times — overwrites existing data.\n\n"
        "Both indexes are derived entirely from the commit history and working "
        "snapshots; the canonical storage is never modified."
    )
    rebuild_parser = subcommands.add_parser(
        "rebuild",
        help="Rebuild local indexes from the full commit history.",
        description=rebuild_description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    rebuild_parser.add_argument(
        "--index", "-i",
        dest="index_name",
        default=None,
        metavar="NAME",
        help="Rebuild a specific index: symbol_history or hash_occurrence. Default: rebuild all.",
    )
    rebuild_parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Show progress.",
    )
    rebuild_parser.add_argument(
        "--json",
        dest="as_json",
        action="store_true",
        help="Emit rebuild summary as JSON.",
    )
    rebuild_parser.set_defaults(func=run_rebuild)
| 228 | |
| 229 | |
def run_status(args: argparse.Namespace) -> None:
    """Show the status and entry count of each local index.

    Prints a human-readable table by default; with ``--json`` emits a list
    of ``{name, status, entries, updated_at}`` objects instead.
    """
    as_json: bool = args.as_json

    root = require_repo()
    infos = index_info(root)

    if as_json:
        out: list[dict[str, str | int | None]] = []
        for info in infos:
            out.append({
                "name": info["name"],
                "status": info["status"],
                "entries": int(info.get("entries", 0)),
                "updated_at": info.get("updated_at") or None,
            })
        print(json.dumps(out, indent=2))
        return

    print("\nLocal index status:")
    print("─" * 50)
    for info in infos:
        status = info["status"]
        name = info["name"]
        # ``updated_at`` may be missing, None, or an ISO timestamp (the JSON
        # branch above normalizes falsy values to null). The original
        # ``info.get("updated_at", "")[:19]`` raised TypeError on a present
        # None value; coerce with ``or ""`` before slicing to seconds.
        updated = (info.get("updated_at") or "")[:19]
        entries = info.get("entries", "0")
        if status == "present":
            print(f" ✅ {name:<20} {entries:>8} entries (updated {updated})")
        elif status == "absent":
            print(f" ⬜ {name:<20} (not built — run: muse index rebuild)")
        else:
            print(f" ❌ {name:<20} corrupt — run: muse index rebuild")
    print()
| 263 | |
| 264 | |
def run_rebuild(args: argparse.Namespace) -> None:
    """Rebuild local indexes from the full commit history.

    Rebuilds ``symbol_history`` and/or ``hash_occurrence`` indexes under
    ``.muse/indices/``. Safe to run multiple times — overwrites existing data.

    Both indexes are derived entirely from the commit history and working
    snapshots; the canonical storage is never modified.

    Examples::

        muse index rebuild
        muse index rebuild --json
        muse index rebuild --index symbol_history
        muse index rebuild --index hash_occurrence --verbose
    """
    target: str | None = args.index_name
    verbose: bool = args.verbose
    as_json: bool = args.as_json

    root = require_repo()

    # Reject unknown index names up front with a USER_ERROR exit.
    if target is not None and target not in ("symbol_history", "hash_occurrence"):
        message = (
            f"❌ Unknown index '{target}'. "
            "Valid names: symbol_history, hash_occurrence."
        )
        print(message, file=sys.stderr)
        raise SystemExit(ExitCode.USER_ERROR)

    rebuild_everything = target is None
    built: list[str] = []
    summary: dict[str, int | list[str]] = {}

    if rebuild_everything or target == "symbol_history":
        if verbose and not as_json:
            print("Building symbol_history index…")
        history = _build_symbol_history(root)
        save_symbol_history(root, history)
        event_total = sum(len(evts) for evts in history.values())
        summary["symbol_history_addresses"] = len(history)
        summary["symbol_history_events"] = event_total
        if not as_json:
            print(f" ✅ symbol_history — {len(history)} addresses, {event_total} events")
        built.append("symbol_history")

    if rebuild_everything or target == "hash_occurrence":
        if verbose and not as_json:
            print("Building hash_occurrence index…")
        occurrence = _build_hash_occurrence(root)
        save_hash_occurrence(root, occurrence)
        address_total = sum(len(addrs) for addrs in occurrence.values())
        summary["hash_occurrence_clusters"] = len(occurrence)
        summary["hash_occurrence_addresses"] = address_total
        if not as_json:
            print(f" ✅ hash_occurrence — {len(occurrence)} clone clusters, {address_total} addresses")
        built.append("hash_occurrence")

    summary["rebuilt"] = built

    if as_json:
        print(json.dumps(summary, indent=2))
        return

    print(f"\nRebuilt {len(built)} index(es) under .muse/indices/")
    print("Run 'muse index status' to verify.")