gabriel / muse public
index_rebuild.py python
330 lines 11.7 KB
6acddccb fix: restore structured --help output for all CLI commands Gabriel Cardona <gabriel@tellurstori.com> 1d ago
1 """muse index — manage and rebuild the optional local index layer.
2
3 Indexes live under ``.muse/indices/`` and are fully derived from the commit
4 history. They are optional — all commands work without them, but indexes
5 dramatically accelerate repeated queries on large repositories.
6
7 Available indexes
8 -----------------
9
10 ``symbol_history``
11 Maps every symbol address to its full event timeline across all commits.
12 Reduces ``muse symbol-log``, ``muse lineage``, and ``muse query-history``
13 from O(commits × files) to O(1) lookups.
14
15 ``hash_occurrence``
16 Maps every ``body_hash`` to the list of addresses that share it.
17 Reduces ``muse clones`` and ``muse find-symbol hash=`` to O(1).
18
19 Sub-commands
20 ------------
21
22 ``muse index status``
23 Show the status, entry count, and last-updated time of each index.
24
25 ``muse index rebuild [--index NAME]``
26 Rebuild one or all indexes by walking the entire commit history.
27 Safe to run multiple times.
28
29 Usage::
30
31 muse index status
32 muse index status --json
33 muse index rebuild
34 muse index rebuild --json
35 muse index rebuild --index symbol_history
36 muse index rebuild --index hash_occurrence
37
38 JSON output — ``muse index status --json``::
39
40 [
41 {"name": "symbol_history", "status": "present", "entries": 1024,
42 "updated_at": "2026-03-21T12:00:00"},
43 {"name": "hash_occurrence", "status": "absent", "entries": 0,
44 "updated_at": null}
45 ]
46
47 JSON output — ``muse index rebuild --json``::
48
49 {"rebuilt": ["symbol_history", "hash_occurrence"],
50 "symbol_history_addresses": 512, "symbol_history_events": 2048,
51 "hash_occurrence_clusters": 31, "hash_occurrence_addresses": 87}
52 """
53
54 from __future__ import annotations
55
56 import argparse
57 import json
58 import logging
59 import pathlib
60 import sys
61
62 from muse.core.errors import ExitCode
63 from muse.core.indices import (
64 HashOccurrenceIndex,
65 SymbolHistoryEntry,
66 SymbolHistoryIndex,
67 index_info,
68 load_hash_occurrence,
69 load_symbol_history,
70 save_hash_occurrence,
71 save_symbol_history,
72 )
73 from muse.core.object_store import read_object
74 from muse.core.repo import require_repo
75 from muse.core.store import get_all_commits, get_commit_snapshot_manifest, read_current_branch
76 from muse.plugins.code._query import is_semantic
77 from muse.plugins.code.ast_parser import parse_symbols
78
79 logger = logging.getLogger(__name__)
80
81
82 # ---------------------------------------------------------------------------
83 # Index build logic
84 # ---------------------------------------------------------------------------
85
86
def _build_symbol_history(root: pathlib.Path) -> SymbolHistoryIndex:
    """Walk all commits oldest-first and build the symbol history index.

    Args:
        root: Repository root containing the ``.muse`` directory.

    Returns:
        Mapping of symbol address to its chronological list of
        ``SymbolHistoryEntry`` events.
    """
    all_commits = sorted(
        get_all_commits(root),
        key=lambda c: c.committed_at,
    )
    index: SymbolHistoryIndex = {}

    for commit in all_commits:
        if commit.structured_delta is None:
            continue
        committed_at = commit.committed_at.isoformat()
        ops = commit.structured_delta.get("ops", [])

        # The snapshot manifest depends only on the commit, so load it at most
        # once per commit.  (Previously it was re-fetched for every child op,
        # which is costly on commits that touch many symbols.)
        manifest: dict | None = None

        for op in ops:
            if op["op"] != "patch":
                continue
            for child in op.get("child_ops", []):
                addr = child["address"]
                if "::" not in addr:
                    continue
                file_path = addr.split("::")[0]
                if not is_semantic(file_path):
                    continue
                child_op = child["op"]
                if child_op not in ("insert", "delete", "replace"):
                    continue

                if manifest is None:
                    manifest = get_commit_snapshot_manifest(root, commit.commit_id) or {}

                # body_hash and signature_id are not stored in the delta, so
                # re-parse the snapshot blob to recover them.
                obj_id = manifest.get(file_path)
                body_hash = ""
                signature_id = ""
                content_id = ""
                if obj_id:
                    raw = read_object(root, obj_id)
                    if raw:
                        tree = parse_symbols(raw, file_path)
                        rec = tree.get(addr)
                        if rec:
                            body_hash = rec["body_hash"]
                            signature_id = rec["signature_id"]
                            content_id = rec["content_id"]

                if not content_id:
                    # Fall back to the content id recorded in the delta.
                    # child_op already equals child["op"], so no second
                    # discriminator check is needed here.
                    if child_op == "replace":
                        content_id = child["new_content_id"]
                    else:  # insert / delete both carry "content_id"
                        content_id = child["content_id"]

                entry = SymbolHistoryEntry(
                    commit_id=commit.commit_id,
                    committed_at=committed_at,
                    op=child_op,
                    content_id=content_id,
                    body_hash=body_hash,
                    signature_id=signature_id,
                )
                index.setdefault(addr, []).append(entry)

    return index
152
153
def _build_hash_occurrence(root: pathlib.Path) -> HashOccurrenceIndex:
    """Build the hash-occurrence index from the HEAD snapshot.

    Returns an empty index when the repository has no HEAD or the current
    branch has no ref yet.  Hashes that occur at only one address are
    dropped — a lone occurrence is not a clone cluster.
    """
    muse_dir = root / ".muse"
    if not (muse_dir / "HEAD").exists():
        return {}
    branch = read_current_branch(root)
    ref_file = muse_dir / "refs" / "heads" / branch
    if not ref_file.exists():
        return {}
    head_commit = ref_file.read_text().strip()

    snapshot = get_commit_snapshot_manifest(root, head_commit) or {}
    occurrences: HashOccurrenceIndex = {}

    # Walk files in deterministic (sorted) order so repeated rebuilds
    # produce identical address lists.
    for path, blob_id in sorted(snapshot.items()):
        if not is_semantic(path):
            continue
        blob = read_object(root, blob_id)
        if blob is None:
            continue
        for address, record in parse_symbols(blob, path).items():
            if record["kind"] == "import":
                continue
            occurrences.setdefault(record["body_hash"], []).append(address)

    # Keep only genuine clone clusters (hash shared by two or more addresses).
    return {h: members for h, members in occurrences.items() if len(members) > 1}
185
186
187 # ---------------------------------------------------------------------------
188 # Sub-commands
189 # ---------------------------------------------------------------------------
190
191
def register(subparsers: "argparse._SubParsersAction[argparse.ArgumentParser]") -> None:
    """Attach the ``index`` command and its sub-commands to *subparsers*."""
    index_parser = subparsers.add_parser(
        "index",
        help="Manage the optional local index layer.",
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    index_subs = index_parser.add_subparsers(dest="subcommand", metavar="SUBCOMMAND")
    index_subs.required = True

    # muse index status
    status = index_subs.add_parser(
        "status",
        help="Show the status and entry count of each local index.",
    )
    status.add_argument(
        "--json",
        dest="as_json",
        action="store_true",
        help="Emit index status as JSON.",
    )
    status.set_defaults(func=run_status)

    # muse index rebuild
    rebuild = index_subs.add_parser(
        "rebuild",
        help="Rebuild local indexes from the full commit history.",
        description=(
            "Rebuilds ``symbol_history`` and/or ``hash_occurrence`` indexes under "
            "``.muse/indices/``. Safe to run multiple times — overwrites existing data.\n\n"
            "Both indexes are derived entirely from the commit history and working "
            "snapshots; the canonical storage is never modified."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    rebuild.add_argument(
        "--index", "-i",
        dest="index_name",
        default=None,
        metavar="NAME",
        help="Rebuild a specific index: symbol_history or hash_occurrence. Default: rebuild all.",
    )
    rebuild.add_argument("--verbose", "-v", action="store_true", help="Show progress.")
    rebuild.add_argument(
        "--json",
        dest="as_json",
        action="store_true",
        help="Emit rebuild summary as JSON.",
    )
    rebuild.set_defaults(func=run_rebuild)
228
229
def run_status(args: argparse.Namespace) -> None:
    """Show the status and entry count of each local index.

    With ``--json``, emits a list of ``{name, status, entries, updated_at}``
    objects; otherwise prints a human-readable summary per index.
    """
    as_json: bool = args.as_json

    root = require_repo()
    infos = index_info(root)

    if as_json:
        out: list[dict[str, str | int | None]] = []
        for info in infos:
            out.append({
                "name": info["name"],
                "status": info["status"],
                "entries": int(info.get("entries", 0)),
                "updated_at": info.get("updated_at") or None,
            })
        print(json.dumps(out, indent=2))
        return

    print("\nLocal index status:")
    print("─" * 50)
    for info in infos:
        status = info["status"]
        name = info["name"]
        # updated_at may be missing *or* present with value None (the JSON
        # contract above shows null is expected); ``or ""`` keeps the slice
        # from raising TypeError on None.
        updated = (info.get("updated_at") or "")[:19]
        entries = info.get("entries", 0)
        if status == "present":
            print(f" ✅ {name:<20} {entries:>8} entries (updated {updated})")
        elif status == "absent":
            print(f" ⬜ {name:<20} (not built — run: muse index rebuild)")
        else:
            print(f" ❌ {name:<20} corrupt — run: muse index rebuild")
    print()
263
264
def run_rebuild(args: argparse.Namespace) -> None:
    """Rebuild local indexes from the full commit history.

    Rebuilds ``symbol_history`` and/or ``hash_occurrence`` indexes under
    ``.muse/indices/``.  Safe to re-run: each rebuild overwrites whatever
    was there before, and the canonical storage is never modified.

    Examples::

        muse index rebuild
        muse index rebuild --json
        muse index rebuild --index symbol_history
        muse index rebuild --index hash_occurrence --verbose
    """
    index_name: str | None = args.index_name
    verbose: bool = args.verbose
    as_json: bool = args.as_json

    root = require_repo()

    known = ("symbol_history", "hash_occurrence")
    if index_name is not None and index_name not in known:
        print(
            f"❌ Unknown index '{index_name}'. "
            "Valid names: symbol_history, hash_occurrence.",
            file=sys.stderr,
        )
        raise SystemExit(ExitCode.USER_ERROR)

    def _selected(name: str) -> bool:
        # No --index means "rebuild everything".
        return index_name is None or index_name == name

    built: list[str] = []
    result: dict[str, int | list[str]] = {}

    if _selected("symbol_history"):
        if verbose and not as_json:
            print("Building symbol_history index…")
        history = _build_symbol_history(root)
        save_symbol_history(root, history)
        total_events = sum(len(events) for events in history.values())
        result["symbol_history_addresses"] = len(history)
        result["symbol_history_events"] = total_events
        if not as_json:
            print(f" ✅ symbol_history — {len(history)} addresses, {total_events} events")
        built.append("symbol_history")

    if _selected("hash_occurrence"):
        if verbose and not as_json:
            print("Building hash_occurrence index…")
        occurrence = _build_hash_occurrence(root)
        save_hash_occurrence(root, occurrence)
        total_addresses = sum(len(addrs) for addrs in occurrence.values())
        result["hash_occurrence_clusters"] = len(occurrence)
        result["hash_occurrence_addresses"] = total_addresses
        if not as_json:
            print(f" ✅ hash_occurrence — {len(occurrence)} clone clusters, {total_addresses} addresses")
        built.append("hash_occurrence")

    result["rebuilt"] = built

    if as_json:
        print(json.dumps(result, indent=2))
        return

    print(f"\nRebuilt {len(built)} index(es) under .muse/indices/")
    print("Run 'muse index status' to verify.")