gabriel / muse public
stat_cache.py python
368 lines 13.4 KB
b7e24576 feat: stat cache — 86x faster muse status Gabriel Cardona <gabriel@tellurstori.com> 4d ago
1 """Stat-based file hash cache — fast snapshot computation for all domains.
2
3 Architecture
4 ------------
5 Every ``plugin.snapshot()`` call must hash every tracked file to detect
6 changes. On a repository with hundreds of files this is the dominant cost of
7 ``muse status``, ``muse diff``, and any command that calls ``snapshot()``.
8
9 ``StatCache`` eliminates redundant I/O by persisting two classes of hash per
10 file between invocations:
11
12 Object hash
13 SHA-256 of raw bytes. Used by the content-addressed object store.
14 Recomputed only when ``(mtime, size)`` changes.
15
16 Dimension hashes
17 Domain-specific semantic hashes. For the code domain these might be the
18 SHA-256 of the AST symbol set, the import set, and so on. For the MIDI
19 domain they might be the hash of parsed note events, tempo map, and
20 harmony analysis. Populated by domain plugins after parsing; consumed by
21 ``diff()`` and ``merge()`` to skip re-parsing unchanged files entirely.
22
23 An empty ``dimensions`` dict means no semantic hashes are cached yet —
24 this is the baseline state and is always safe.
25
26 Cache validity
27 --------------
28 A cache entry is valid when the file's current ``st_mtime`` and ``st_size``
29 exactly match the stored values — the same contract Git's index uses. The
30 cache is **self-healing**: writing a file (e.g. ``muse checkout``) always
31 updates ``mtime``, causing a cache miss on the next scan.
32
33 Known corner-case ("racy Muse"): a file modified within the same filesystem
34 timestamp quantum *and* with identical size could be served a stale cache
35 entry. This is identical to "racy git" and is not defended against here.
36
37 Storage
38 -------
39 ``.muse/stat_cache.json`` — a versioned JSON document::
40
41 {
42 "version": 1,
43 "entries": {
44 "muse/core/snapshot.py": {
45 "mtime": 1710000000.123456,
46 "size": 4321,
47 "object_hash": "<sha256-of-raw-bytes>",
48 "dimensions": {
49 "symbols": "<sha256-of-ast-symbol-set>",
50 "imports": "<sha256-of-import-set>"
51 }
52 }
53 }
54 }
55
56 Writes are atomic: data is flushed to a ``.tmp`` sibling then renamed over
57 the target, so a crash mid-write never corrupts the cache.
58 """
59
60 from __future__ import annotations
61
62 import hashlib
63 import json
64 import logging
65 import pathlib
66 from typing import TypedDict
67
logger = logging.getLogger(__name__)

# Bump whenever the on-disk JSON schema changes; StatCache.load() discards
# any file whose "version" field does not match exactly.
_CACHE_VERSION = 1
# Stored inside the repository's .muse directory, next to other metadata.
_CACHE_FILENAME = "stat_cache.json"
# 64 KiB read size for streaming SHA-256 — keeps memory constant for any file.
_CHUNK = 65_536
73
74
class FileCacheEntry(TypedDict):
    """Persisted metadata for a single workspace file.

    An entry is trusted only while the file's live ``st_mtime``/``st_size``
    still equal ``mtime``/``size``; on mismatch ``StatCache`` recomputes the
    object hash and resets ``dimensions``.
    """

    mtime: float  # st_mtime observed when object_hash was computed
    size: int  # st_size observed at the same moment
    object_hash: str  # SHA-256 hex digest of the file's raw bytes
    # Domain plugins write semantic hashes here after parsing.
    # Keys are dimension names ("symbols", "imports", "notes", …).
    # Empty dict == no dimension hashes cached yet; always safe to return None.
    dimensions: dict[str, str]
85
86
class _CacheDoc(TypedDict):
    """On-disk JSON document shape (see the module docstring for an example)."""

    version: int  # must equal _CACHE_VERSION or the whole file is discarded
    entries: dict[str, FileCacheEntry]  # keyed by workspace-relative POSIX path
93
def _hash_bytes(path: pathlib.Path) -> str:
    """Return the SHA-256 hex digest of *path*'s raw bytes.

    Thin ``pathlib`` adapter over :func:`_hash_str`, which streams the file
    in 64 KiB chunks so memory stays constant regardless of file size. This
    keeps a single canonical hashing implementation shared by the cache and
    all domain plugins — no duplicated ``_hash_file`` helpers.
    """
    path_str = str(path)
    return _hash_str(path_str)
102
103
def _hash_str(path_str: str) -> str:
    """String-path variant of ``_hash_bytes`` — skips ``Path`` construction.

    Used in the hot inner loop of ``walk_workdir`` and plugin snapshot
    methods, where ``os.walk`` already yields plain string paths.
    """
    digest = hashlib.sha256()
    with open(path_str, "rb") as stream:
        # Stream in fixed-size chunks; an empty read marks end of file.
        chunk = stream.read(_CHUNK)
        while chunk:
            digest.update(chunk)
            chunk = stream.read(_CHUNK)
    return digest.hexdigest()
115
116
class StatCache:
    """Shared stat-based hash cache for all domain plugin ``snapshot()`` calls.

    Typical lifecycle inside a plugin's ``snapshot()``::

        cache = StatCache.load(root / ".muse")
        for file_path in walk(...):
            files[rel] = cache.get_object_hash(root, file_path)
        cache.prune(set(files))
        cache.save()

    The same instance can be passed to ``diff()`` or ``merge()`` logic to
    retrieve already-computed dimension hashes without re-parsing files.
    """

    def __init__(
        self, muse_dir: pathlib.Path | None, entries: dict[str, FileCacheEntry]
    ) -> None:
        # muse_dir is None for in-memory caches (see :meth:`empty`);
        # :meth:`save` becomes a no-op in that case.
        self._muse_dir = muse_dir
        self._entries = entries
        # Set on any mutation so save() can skip rewriting unchanged caches.
        self._dirty = False

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------

    @classmethod
    def load(cls, muse_dir: pathlib.Path) -> StatCache:
        """Load the cache from *muse_dir*/stat_cache.json.

        Validates the version field and every entry's field types on load so
        a corrupt or future-format file never poisons the cache. Returns a
        fresh empty cache if the file is absent, unreadable, or version
        mismatches — never raises.

        Parsing is done inline (not via a typed helper) so that isinstance
        checks narrow from ``Any`` — the type returned by ``json.loads`` —
        giving mypy accurate control-flow narrowing without unreachable-branch
        false positives.
        """
        cache_file = muse_dir / _CACHE_FILENAME
        if cache_file.is_file():
            try:
                raw = json.loads(cache_file.read_text(encoding="utf-8"))
                if not (isinstance(raw, dict) and raw.get("version") == _CACHE_VERSION):
                    return cls(muse_dir, {})
                raw_entries = raw.get("entries")
                if not isinstance(raw_entries, dict):
                    return cls(muse_dir, {})
                entries: dict[str, FileCacheEntry] = {}
                for rel, ev in raw_entries.items():
                    if not isinstance(rel, str) or not isinstance(ev, dict):
                        continue
                    mtime = ev.get("mtime")
                    size = ev.get("size")
                    obj_hash = ev.get("object_hash")
                    dims = ev.get("dimensions")
                    if not (
                        isinstance(mtime, (int, float))
                        and isinstance(size, int)
                        and isinstance(obj_hash, str)
                        and isinstance(dims, dict)
                    ):
                        continue
                    entries[rel] = FileCacheEntry(
                        mtime=float(mtime),
                        size=size,
                        object_hash=obj_hash,
                        # Coerce dimension keys/values to str — guards against
                        # a cache written by a future version with non-str values.
                        dimensions={str(k): str(v) for k, v in dims.items()},
                    )
                return cls(muse_dir, entries)
            # OSError covers unreadable files (PermissionError, the file
            # vanishing between is_file() and read_text()); ValueError covers
            # both json.JSONDecodeError and UnicodeDecodeError, which are
            # ValueError subclasses. Without OSError/ValueError here, an I/O
            # or decode failure would violate the "never raises" contract
            # documented above.
            except (OSError, ValueError, KeyError, TypeError):
                logger.debug("⚠️ stat_cache.json unreadable — starting fresh")
        return cls(muse_dir, {})

    @classmethod
    def empty(cls) -> StatCache:
        """Return a no-op cache for contexts without a ``.muse`` directory."""
        return cls(None, {})

    # ------------------------------------------------------------------
    # Object hash — raw-bytes SHA-256
    # ------------------------------------------------------------------

    def get_cached(
        self, rel: str, abs_path_str: str, mtime: float, size: int
    ) -> str:
        """Fast inner-loop hash lookup with pre-computed stat values.

        Callers that already have ``(mtime, size)`` from an ``os.stat`` or
        ``os.walk`` call should use this method to avoid a redundant
        ``stat()`` syscall inside :meth:`get_object_hash`.

        Args:
            rel: Workspace-relative POSIX path (cache key).
            abs_path_str: Absolute file path as a plain string — avoids
                constructing a ``pathlib.Path`` in the hot loop.
            mtime: ``st_mtime`` from the caller's stat result.
            size: ``st_size`` from the caller's stat result.

        Returns:
            64-character lowercase hex SHA-256 digest.
        """
        entry = self._entries.get(rel)
        if entry is not None and entry["mtime"] == mtime and entry["size"] == size:
            return entry["object_hash"]

        obj_hash = _hash_str(abs_path_str)
        # Cache miss: rebuild the entry from scratch. Dimension hashes are
        # deliberately dropped — the file changed, so they are stale.
        self._entries[rel] = FileCacheEntry(
            mtime=mtime,
            size=size,
            object_hash=obj_hash,
            dimensions={},
        )
        self._dirty = True
        return obj_hash

    def get_object_hash(self, root: pathlib.Path, file_path: pathlib.Path) -> str:
        """Return the SHA-256 of *file_path*, using the cache when valid.

        Convenience wrapper around :meth:`get_cached` for callers that work
        with ``pathlib.Path`` objects. The hot inner loops of ``walk_workdir``
        and plugin snapshot methods call :meth:`get_cached` directly to skip
        pathlib overhead.

        Args:
            root: Repository root — used to compute the workspace-relative
                POSIX key.
            file_path: Absolute path to the file.

        Returns:
            64-character lowercase hex SHA-256 digest.
        """
        rel = file_path.relative_to(root).as_posix()
        st = file_path.stat()
        return self.get_cached(rel, str(file_path), st.st_mtime, st.st_size)

    # ------------------------------------------------------------------
    # Dimension hashes — domain-specific semantic hashes
    # ------------------------------------------------------------------

    def get_dimension(
        self,
        root: pathlib.Path,
        file_path: pathlib.Path,
        dimension: str,
    ) -> str | None:
        """Return a cached dimension hash, or ``None`` if not yet computed.

        Callers must verify that the entry is still valid by checking that
        the object hash hasn't changed (i.e. call ``get_object_hash`` first
        to ensure the entry is fresh).

        Args:
            root: Repository root.
            file_path: Absolute path to the file.
            dimension: Dimension name, e.g. ``"symbols"`` or ``"notes"``.

        Returns:
            Cached hash string, or ``None`` if absent.
        """
        rel = file_path.relative_to(root).as_posix()
        entry = self._entries.get(rel)
        if entry is None:
            return None
        return entry["dimensions"].get(dimension)

    def set_dimension(
        self,
        root: pathlib.Path,
        file_path: pathlib.Path,
        dimension: str,
        hash_value: str,
    ) -> None:
        """Store a semantic hash for a specific dimension of *file_path*.

        Should be called by domain plugins after parsing a file whose object
        hash triggered a cache miss. Silently ignored if the file has no
        entry (which should not happen in normal operation).

        Args:
            root: Repository root.
            file_path: Absolute path to the file.
            dimension: Dimension name, e.g. ``"symbols"``.
            hash_value: Hash string to store.
        """
        rel = file_path.relative_to(root).as_posix()
        entry = self._entries.get(rel)
        if entry is None:
            return
        entry["dimensions"][dimension] = hash_value
        self._dirty = True

    # ------------------------------------------------------------------
    # Lifecycle helpers
    # ------------------------------------------------------------------

    def prune(self, known_paths: set[str]) -> None:
        """Remove entries for paths no longer present in the working tree.

        Call this after a full directory walk, passing the set of
        workspace-relative POSIX paths that were found. Keeps the cache
        lean by evicting stale entries for deleted files.

        Args:
            known_paths: Set of rel-posix paths observed during the walk.
        """
        stale = set(self._entries) - known_paths
        if stale:
            for k in stale:
                del self._entries[k]
            self._dirty = True

    def save(self) -> None:
        """Atomically persist the cache to disk if it has changed.

        Uses a temp-file-then-rename pattern so a crash mid-write never
        leaves a corrupt cache file. Silently skips when there is no
        ``.muse`` directory (e.g. in-memory unit tests).
        """
        if not self._dirty or self._muse_dir is None:
            return
        doc = _CacheDoc(version=_CACHE_VERSION, entries=self._entries)
        cache_file = self._muse_dir / _CACHE_FILENAME
        tmp = self._muse_dir / (_CACHE_FILENAME + ".tmp")
        tmp.write_text(
            json.dumps(doc, separators=(",", ":"), ensure_ascii=False),
            encoding="utf-8",
        )
        # os.replace semantics: atomic on POSIX, overwrites on Windows.
        tmp.replace(cache_file)
        self._dirty = False
        logger.debug("✅ stat_cache saved (%d entries)", len(self._entries))
351
352
def load_cache(root: pathlib.Path) -> StatCache:
    """Convenience loader: return a ``StatCache`` for a repository root.

    Falls back to ``StatCache.empty()`` when *root* contains no ``.muse``
    directory, so callers never need to guard against a missing repo.

    Args:
        root: Repository root (the directory that contains ``.muse/``).

    Returns:
        A ``StatCache`` instance ready for use.
    """
    muse_dir = root / ".muse"
    return StatCache.load(muse_dir) if muse_dir.is_dir() else StatCache.empty()